Ejemplo n.º 1
0
/*
 * List-visitor callback that checks one file group.
 *
 * node - the FGroup being visited
 * data - points to an rc_t that receives the code of the last problem found
 *
 * The file-type mask of the first group that provides one is remembered in a
 * function-local static, so it persists across calls. That first mask is
 * checked for completeness (sequence data present, not both Fasta and QSeq,
 * quality data present); every file visited afterwards is only checked for
 * its group having the very same mask.
 */
void FGroup_Validate( SLNode *node, void *data )
{
    static EIlluminaNativeFileType mask = eIlluminaNativeFileTypeNotSet;
    FGroup* group = (FGroup*)node;
    rc_t* result = (rc_t*)data;
    IlluminaFileInfo* cur;

    DEBUG_MSG(3, ("==> group: '%s'\n", group->key.data));
    for( cur = group->files; cur != NULL; cur = cur->next ) {
        DEBUG_MSG(3, ("file: type %u '%s'\n", cur->type, cur->name.data));

        if( mask != eIlluminaNativeFileTypeNotSet ) {
            /* reference mask already known: this group must match it */
            if( mask != group->mask ) {
                *result = RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
                SRALoaderFile_LOG(cur->file, klogErr, *result, "file group '$(p)*': no match in spot names on 1st lines across files in group",
                                  PLOG_S(p) , group->key.data);
            }
            continue;
        }

        /* first mask seen: remember it and verify it is usable */
        mask = group->mask;
        if( (mask & (eIlluminaNativeFileTypeFasta | eIlluminaNativeFileTypeQSeq)) == 0 ) {
            /* neither kind of sequence file is present */
            *result = RC(rcSRA, rcFormatter, rcReading, rcData, rcNotFound);
            SRALoaderFile_LOG(cur->file, klogErr, *result, "file group '$(p)*': sequence data", PLOG_S(p) , group->key.data);
        }
        if( (mask & eIlluminaNativeFileTypeFasta) != 0 && (mask & eIlluminaNativeFileTypeQSeq) != 0 ) {
            /* both kinds of sequence file are present */
            *result = RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
            SRALoaderFile_LOG(cur->file, klogErr, *result, "file group '$(p)*': _seq and _qseq", PLOG_S(p) , group->key.data);
        }
        if( (mask & (eIlluminaNativeFileTypeQuality4 | eIlluminaNativeFileTypeQSeq)) == 0 ) {
            /* no source of quality values */
            *result = RC(rcSRA, rcFormatter, rcReading, rcData, rcNotFound);
            SRALoaderFile_LOG(cur->file, klogErr, *result, "file group '$(p)*': quality data", PLOG_S(p) , group->key.data);
        }
    }
    DEBUG_MSG(3, ("<== group: '%s'\n", group->key.data));
}
Ejemplo n.º 2
0
/*
 * Handle an SRF data (header) chunk.
 *
 * The SRF part of the chunk is parsed first (it yields the read-name prefix,
 * stored in the context); whatever bytes follow are appended to the ZTR
 * buffer and consumed block by block. Blocks are processed and immediately
 * released. If the last block is incomplete (rcData/rcInsufficient from the
 * block parser) the unparsed remainder is fetched into fe->defered.
 *
 * Returns 0 on success, otherwise the rc of the step that failed.
 */
static
rc_t parse_header(SRF_context *ctx, ZTR_Context *ztr_ctx, const uint8_t *data, size_t size)
{
    fe_context_t* fe = (fe_context_t*)ctx;
    enum ztr_chunk_type type;
    ztr_raw_t ztr_raw;
    ztr_t ztr;
    uint32_t counter;
    char prefixType;
    size_t parsed;
    rc_t rc;

    if( SRF_ParseDataChunk(data, size, &parsed, &fe->name_prefix, &prefixType, &counter) != 0 ) {
        /* any SRF parsing failure is reported as invalid data */
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInvalid);
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "corrupt", NULL);
    }

    /* drop the remainder kept from an earlier chunk; free(NULL) is a no-op */
    free((void *)fe->defered);
    fe->defered = NULL;

    if( parsed == size ) {
        /* nothing follows the SRF part */
        return 0;
    }

    rc = ZTR_AddToBuffer(ztr_ctx, data + parsed, size - parsed);
    if( rc != 0 ) {
        return rc;
    }

    rc = ZTR_ParseHeader(ztr_ctx);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "corrupt", NULL);
    }

    /* rc is 0 whenever the loop condition is evaluated: every failure leaves via break */
    while( !ZTR_BufferIsEmpty(ztr_ctx) ) {
        rc = ZTR_ParseBlock(ztr_ctx, &ztr_raw);
        if( rc != 0 ) {
            if( GetRCState(rc) == rcInsufficient && GetRCObject(rc) == (enum RCObject)rcData ) {
                /* incomplete trailing block: keep the leftover bytes */
                rc = ZTR_BufferGetRemainder(ztr_ctx, &fe->defered, &fe->defered_len);
            }
            break;
        }

        rc = ZTR_ProcessBlock(ztr_ctx, &ztr_raw, &ztr, &type);
        if( rc != 0 ) {
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "corrupt", NULL);
            break;
        }

        /* the decoded block is not needed here: release it and the raw parts */
        free(*(void **)&ztr);
        free(ztr_raw.meta);
        free(ztr_raw.data);
    }
    return rc;
}
Ejemplo n.º 3
0
/*
 * Build an Illumina spot from the current read and hand it to the writer.
 *
 * self   - loader context; supplies the name prefix, the read data and the writer
 * flags  - SRF read flags, converted into the read filter
 * readId - read id from the SRF chunk; if it contains '#', its len is shortened
 *          to the part before '#' and the text after '#' becomes the spot group
 *
 * Returns 0 on success, otherwise the rc of the failing step (already logged
 * for the name/barcode preparation steps).
 */
static
rc_t fe_new_read(fe_context_t *self, int flags, pstring *readId )
{
    rc_t rc;
    char *suffix;
    pstring readName, spotGroup;
    static IlluminaSpot spot;

    /* look for spot group */
    suffix = strchr(readId->data, '#');
    if( suffix != NULL ) {
        /* only len is adjusted; the underlying buffer still holds "#group" */
        readId->len = suffix++ - readId->data;
        if( (rc = pstring_assign(&spotGroup, suffix, strlen(suffix))) != 0 ) {
            SRALoaderFile_LOG(self->ctx.file, klogInt, rc,
                "extracting barcode from spot '$(spotname)'", "spotname=%s", readId->data);
            return rc;
        }
    } else {
        pstring_clear(&spotGroup);
    }

    /* build the read name from prefix (self->name_prefix) and read id */
    if(self->name_prefix.len > 0 ) {
        if( (rc = pstring_copy(&readName, &self->name_prefix)) == 0 ) {
            /* a prefix ending in a digit gets a ':' separator before the id;
               the cast keeps isdigit() defined for bytes with the high bit set */
            if( isdigit((unsigned char)readName.data[readName.len - 1]) ) {
                rc = pstring_append(&readName, ":", 1);
            }
            if( rc == 0 ) {
                rc = pstring_concat(&readName, readId);
            }
        }
    } else {
        rc = pstring_copy(&readName, readId);
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(self->ctx.file, klogErr, rc,
            "preparing spot name $(spotname)", "spotname=%s", readId->data);
        return rc;
    }
    SRF_set_read_filter(&self->read.filter, flags);

    IlluminaSpot_Init(&spot);
    if( (rc = IlluminaSpot_Add(&spot, &readName, &spotGroup, &self->read)) == 0 ) {
        rc = SRAWriterIllumina_Write(self->writer, self->ctx.file, &spot);
    }
    return rc;
}
Ejemplo n.º 4
0
/*
 * Load all reads of one SFF input and pass each of them to the configured
 * writer (454 if self->w454 is set, IonTorrent otherwise).
 *
 * Each loop iteration handles at most one read:
 *   - when curr_read_number is 0 the common header is read first;
 *   - then index is skipped, the read header and read data are read and the
 *     read is written; curr_read_number is advanced only on success;
 *   - once the declared number of reads has been loaded the counter is reset
 *     and the function probes for EOF or for another header to follow.
 *
 * Returns 0 when EOF is reached cleanly or when the common header reader
 * reports rcData/rcIgnored; otherwise the rc that stopped the loop.
 */
static
rc_t SFFLoaderFmtWriteDataFile(SFFLoaderFmt* self, const SRALoaderFile* file)
{
    rc_t rc = 0;

    while( rc == 0 ) {
        /* start of a section: the common header has to be read before any read */
        if( self->curr_read_number == 0 ) {
            if( (rc = SFFLoaderFmtReadCommonHeader(self, file)) == 0 ) {
                DEBUG_MSG (5, ("%s: Common header ok: %u reads\n", self->file_name, self->header.number_of_reads));
                DEBUG_MSG (8, ("%s: flow_chars: [%hu] %s\n", self->file_name, self->header.num_flows_per_read, self->flow_chars.data));
                DEBUG_MSG (8, ("%s: key_seq: [%hu] %s\n", self->file_name, self->header.key_length, self->key_seq.data));
            } else if( GetRCObject(rc) == (enum RCObject)rcData && GetRCState(rc) == rcIgnored ) {
                /* header reader asked to ignore this input: finish without error */
                rc = 0;
                break;
            }
        }
        /* read one read (index skip, header, data); the chain stops at the first failure */
        if( rc == 0 && self->header.number_of_reads != 0 && 
            (rc = SFFLoaderFmtSkipIndex(self, file)) == 0 &&
            (rc = SFFLoaderFmtReadDataHeader(self, file)) == 0 &&
            (rc = SFFLoaderFmtReadData(self, file)) == 0 ) {
            /* signal and position are withheld from the writer when skip_signal is set */
            if( self->w454 ) {
                rc = SRAWriter454_WriteRead(self->w454, file, &self->name, &self->read, &self->quality,
                                         self->skip_signal ? NULL : &self->signal,
                                         self->skip_signal ? NULL : &self->position,
                                         self->read_header.clip_quality_left, self->read_header.clip_quality_right,
                                         self->read_header.clip_adapter_left, self->read_header.clip_adapter_right);
            } else {
                rc = SRAWriterIonTorrent_WriteRead(self->wIonTorrent, file, &self->name, &self->read, &self->quality,
                         self->skip_signal ? NULL : &self->signal,
                         self->skip_signal ? NULL : &self->position,
                         self->read_header.clip_quality_left, self->read_header.clip_quality_right,
                         self->read_header.clip_adapter_left, self->read_header.clip_adapter_right);
            }
            if( rc == 0 ) {
                ++self->curr_read_number;
            }
        }
        /* NOTE(review): the two inequality tests are joined with &&, so an rc whose
           object is rcTransfer OR whose state is rcDone is not logged here and falls
           through to the else-if below, where rc may be overwritten. Confirm this is
           intended rather than !(object == rcTransfer && state == rcDone). */
        if( rc != 0 && (GetRCObject(rc) != rcTransfer && GetRCState(rc) != rcDone) ) {
            SRALoaderFile_LOG(file, klogErr, rc, "on or about read #$(i)", PLOG_U32(i), self->curr_read_number + 1);
        } else if( self->curr_read_number == self->header.number_of_reads ) {
            DEBUG_MSG(5, ("%s: done loading declared %u reads\n", self->file_name, self->curr_read_number));
            /* reset so that the next iteration starts with a common header again */
            self->curr_read_number = 0;
            /* will skip indexes if they are at eof */
            if( (rc = SFFLoaderFmtSkipIndex(self, file)) == 0 ) {
                /* This should be the end of file and/or beginning of next */
                if( (rc = SFFLoaderFmt_ReadBlock(self, file, 0, "EOF", false)) == 0 ) {
                    /* a NULL buffer after a zero-size read is taken as end of input */
                    if( self->file_buf == NULL ) {
                        DEBUG_MSG(5, ("%s: EOF detected\n", self->file_name));
                        self->index_correction = 0;
                        break;
                    }
                }
            }
        }
    }
    return rc;
}
Ejemplo n.º 5
0
/*
 * Read and validate the header of the next read, then extract the read name.
 *
 * The fixed-size part of the read header is copied into self->read_header
 * (byte-swapped on little-endian hosts), the declared header length is
 * checked against the length computed from the name length (rounded up to a
 * multiple of 8), and the name is copied into self->name.
 *
 * Returns 0 on success; rcFormat/rcInvalid when the declared and computed
 * header lengths disagree; otherwise the rc of the failing read or copy.
 */
static
rc_t SFFLoaderFmtReadDataHeader(SFFLoaderFmt* self, const SRALoaderFile* file)
{
    rc_t rc = 0;
    /* 32 bits wide so that fixed size + 16-bit name length + padding cannot wrap around */
    uint32_t head_sz = 0;

    /* Make sure the entire fixed portion of Read Header section is in the file buffer window */
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, SFFReadHeader_size, "read header", false)) != 0 ) {
        return rc;
    }
    memcpy(&self->read_header, self->file_buf, SFFReadHeader_size);
#if __BYTE_ORDER == __LITTLE_ENDIAN
    self->read_header.header_length = bswap_16(self->read_header.header_length);
    self->read_header.name_length = bswap_16(self->read_header.name_length);
    self->read_header.number_of_bases = bswap_32(self->read_header.number_of_bases);
    self->read_header.clip_quality_left = bswap_16(self->read_header.clip_quality_left);
    self->read_header.clip_quality_right = bswap_16(self->read_header.clip_quality_right);
    self->read_header.clip_adapter_left = bswap_16(self->read_header.clip_adapter_left);
    self->read_header.clip_adapter_right = bswap_16(self->read_header.clip_adapter_right);
#endif

    /* expected header length: fixed part + name, padded to an 8-byte boundary */
    head_sz = (uint32_t)SFFReadHeader_size + self->read_header.name_length;
    head_sz += (head_sz % 8) ? (8 - (head_sz % 8)) : 0;
    if( head_sz != self->read_header.header_length ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcFormat, rcInvalid);
        /* report the length declared in the READ header (not the common header) */
        SRALoaderFile_LOG(file, klogErr, rc, "read header length $(h) != $(s)", PLOG_2(PLOG_U16(h),PLOG_U32(s)),
                          self->read_header.header_length, head_sz);
        return rc;
    }
    /* read name */
    /* step over the fixed part on the next read; ReadBlock resets file_advance to 0 */
    self->file_advance = SFFReadHeader_size;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, head_sz - SFFReadHeader_size, "read header", false)) != 0) {
        return rc;
    }
    /* the name and its padding get skipped by whichever read comes next */
    self->file_advance = head_sz - SFFReadHeader_size;

    if( (rc = pstring_assign(&self->name, self->file_buf, self->read_header.name_length)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "copying read name", NULL);
    }
    return rc;
}
Ejemplo n.º 6
0
/*
 * Record the read layout described by a ZTR region chunk.
 *
 * region[] is consumed in pairs: for read i the entry at 2*i+1 supplies the
 * start offset, the name (turned into label and read type) and the region
 * type, while the first character of the name of the entry at 2*i becomes
 * the read's cs_key.
 *
 * self         - loader context whose region description is filled in
 * region_count - number of entries in region[]
 * region       - region entries taken from the chunk
 *
 * Returns 0 on success; rcUnsupported when the resulting read count is 0 or
 * above ABSOLID_FMT_MAX_NUM_READS; rcUnexpected for a region type other than
 * Biological/Normal/Paired/Technical; rcDuplicate when the first two reads
 * have the same type; otherwise the rc from set_label_type.
 */
static
rc_t fe_new_region(fe_context_t *self, size_t region_count, const region_t region[])
{
    rc_t rc = 0;
    int i;
    /* keep the full-width count: the range check must not run on a value that
       may have been narrowed when stored into self->region.nreads */
    const size_t nreads = region_count / 2;

    self->region.nreads = nreads;
    DEBUG_MSG(3, ("REGION: %u -> %u reads\n", (unsigned)region_count, (unsigned)self->region.nreads));
    if( nreads == 0 || nreads > ABSOLID_FMT_MAX_NUM_READS ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
        SRALoaderFile_LOG(self->ctx.file, klogErr, rc, "read count $(c)", PLOG_U8(c), self->region.nreads);
    }
    for(i = 0; rc == 0 && i < self->region.nreads ; i++ ) {
        /* odd entry of the pair describes the read, even entry carries the cs_key */
        int j = i * 2 + 1;
        self->region.start[i] = region[j].start;
        if( (rc = set_label_type(region[j].name, &self->region.label[i], &self->region.type[i])) != 0 ) {
            break;
        }
        self->region.cs_key[i] = region[j - 1].name[0];
        DEBUG_MSG(3, ("REGION[%u]: '%s', %u, '%c', start: %u\n",
                      i, self->region.label[i].data, self->region.type[i], self->region.cs_key[i], self->region.start[i]));
        switch(region[j].type) {
            case Biological:
            case Normal:
            case Paired:
            case Technical:
                break;
            default:
                rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                SRALoaderFile_LOG(self->ctx.file, klogErr, rc, "read #$(read_id) type mismatch; expected $(expected), got $(got)",
                        "read_id=%u,expected=%s,got=%u", i, "(B|N|P|T)", region[j].type);
                return rc;
        }
    }
    /* two reads of the same type cannot be told apart later on */
    if( rc == 0 &&
        self->region.nreads > 1 && self->region.type[0] == self->region.type[1] ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcDuplicate);
        SRALoaderFile_LOG(self->ctx.file, klogErr, rc, "both reads have same type", NULL);
    }
    return rc;
}
Ejemplo n.º 7
0
/*
 * Read the remainder of a FASTQ record once its sequence defline was seen:
 * sequence line(s), the '+' quality defline and the quality line(s).
 *
 * file       - input being read; file->line is reset whenever a defline is consumed
 * sd         - receives sequence, quality and quality type; marked ready on success
 * best_word  - passed through to parse_spot_name for the quality defline
 * best_score - score at which barcode and read id must match as well
 * qualType   - stored as the quality type of the read
 *
 * Returns 0 on success, otherwise the value returned by SRALoaderFile_LOG for
 * the problem that was detected.
 */
static
rc_t read_spot_data_3lines(FastqFileInfo* file, FileReadData* sd, uint8_t best_word, uint8_t best_score, int qualType)
{
    rc_t rc;

    /* discard defline */
    file->line = NULL;

    /* sequence: everything up to the '+' line */
    rc = read_multiline_seq_or_qual(file, '+', &sd->read.seq);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence data");
    }
    if( !pstring_is_fasta(&sd->read.seq) ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected sequence data");
    }

    /* quality defline */
    rc = file_read_line(file, false);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
    }
    if( file->line[0] != '+' ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '+' on quality defline");
    }

    /* a bare '+' carries no name; anything longer has to agree with the sequence defline */
    if( file->line_len != 1 ) {
        FileReadData d;
        bool mismatch;
        const uint8_t score = parse_spot_name(file->file, &d, &file->line[1], file->line_len - 1, best_word);

        /* the quality defline may lack barcode and read id, giving a score below
           best_score, but a non-empty line must yield at least the name (score >= 1) */
        if( score < 1 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not found");
        }

        mismatch = pstring_cmp(&sd->name, &d.name) != 0;
        if( !mismatch && score == best_score ) {
            /* fully parsed defline: barcode and read id are compared too */
            mismatch = pstring_cmp(&sd->barcode, &d.barcode) != 0 ||
                       sd->read.read_id != d.read.read_id;
        }
        if( mismatch ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=quality defline do not match sequence defline");
        }
    }

    /* discard defline */
    file->line = NULL;

    /* quality: everything up to the next '@' line */
    rc = read_multiline_seq_or_qual(file, '@', &sd->read.qual);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=failed to read quality");
    }
    if( sd->read.qual.len <= 0 ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcEmpty);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=quality");
    }

    sd->read.qual_type = qualType;
    sd->ready = true;
    return 0;
}
Ejemplo n.º 8
0
/*
 * Handle an SRF data (header) chunk for the ABSOLID loader.
 *
 * The SRF part yields the read-name prefix (stored in the context); the rest
 * of the chunk is appended to the ZTR buffer and consumed block by block.
 * A REGN block is forwarded to fe_new_region(); all decoded blocks are
 * released after use.
 *
 * Returns 0 on success, otherwise the rc of the step that failed.
 */
static
rc_t parse_v1_header(SRF_context *ctx, ZTR_Context *ztr_ctx, const uint8_t *data, size_t size)
{
    rc_t rc = 0;
    size_t parsed;
    char prefixType;
    uint32_t counter;
    ztr_raw_t ztr_raw;
    ztr_t ztr;
    enum ztr_chunk_type type;
    fe_context_t* fe = (fe_context_t*)ctx;
    
    if( (rc = SRF_ParseDataChunk(data, size, &parsed, &fe->name_prefix, &prefixType, &counter)) != 0 ) {
        /* any SRF parsing failure is reported as invalid data */
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInvalid);
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "parse_v1_header - failed to parse SRF chunk", NULL);
    }
    DEBUG_MSG(3, ("HEADER PREFIX: '%s'\n", fe->name_prefix.data));
    if((rc = ABI_ZTR_AddToBuffer(ztr_ctx, data + parsed, size - parsed)) != 0) {
        return rc;
    }
    if((rc = ABI_ZTR_ParseHeader(ztr_ctx)) != 0) {
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "parse_v1_header - failed to parse ZTR header", NULL);
    }
    while(rc == 0 && !ABI_ZTR_BufferIsEmpty(ztr_ctx)) {
        if((rc = ABI_ZTR_ParseBlock(ztr_ctx, &ztr_raw)) != 0) {
            return SRALoaderFile_LOG(ctx->file, klogErr, rc, "parse_v1_header - failed to parse ZTR chunk", NULL);
        }
        if((rc = ABI_ZTR_ProcessBlock(ztr_ctx, &ztr_raw, &ztr, &type)) != 0) {
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "parse_v1_header - failed to process ZTR chunk", NULL);
            /* stop here: 'type' and 'ztr' cannot be trusted after a failed
               ProcessBlock, and the failure code must not be overwritten below */
            break;
        }
        if(type == REGN) {
            rc = fe_new_region(fe, ztr.region->count, ztr.region->region);
        }
        /* release the decoded block, whatever its kind */
        if(*(void **)&ztr != NULL) {
            free(*(void **)&ztr);
        }
    }
    return rc;
}
Ejemplo n.º 9
0
/*
 * Make at least 'size' bytes of the input available in self->file_buf.
 *
 * The pending self->file_advance is passed to SRALoaderFileRead and then
 * reset to 0, whether or not the read succeeded.
 *
 * self     - loader state; file_buf is updated by the read
 * file     - input file
 * size     - number of bytes required; 0 only probes the input
 * location - text naming what is being read, used in the error message
 * silent   - when true, failures are not logged
 *
 * Returns 0 on success, the rc from SRALoaderFileRead, or rcData/rcInsufficient
 * when size > 0 and no buffer or fewer than 'size' bytes were obtained.
 */
static
rc_t SFFLoaderFmt_ReadBlock(SFFLoaderFmt* self, const SRALoaderFile* file, size_t size, const char* location, bool silent)
{
    size_t got = 0;
    rc_t rc = SRALoaderFileRead(file, self->file_advance, size, (const void**)&self->file_buf, &got);
    self->file_advance = 0;
    if( rc == 0 && (size > 0 && (self->file_buf == NULL || got < size)) ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInsufficient);
    }
    if( rc != 0 && !silent ) {
        /* PLOG_U32 consumes a 32-bit argument: convert the size_t values explicitly */
        SRALoaderFile_LOG(file, klogErr, rc, "$(l), needed $(needed) got $(got) bytes",
                PLOG_3(PLOG_S(l),PLOG_U32(needed),PLOG_U32(got)), location, (uint32_t)size, (uint32_t)got);
    }
    return rc;
}
Ejemplo n.º 10
0
/*
 * in a single line form tries to grab the last two chunks defined by sep into seq and qual
 * ignores spaces adjacent to sep
 * normally line would look like "name sep seq sep sep qual"
 *
 * The text before the second-to-last separator must parse as a spot name.
 * On success sequence, quality and quality type are stored in file->spot and
 * true is returned; in every other case false is returned. A failure to store
 * the data is logged.
 */
static
bool find_seq_qual_by_sep(FastqLoaderFmt* self, FastqFileInfo* file, const char sep)
{
    const char* seq = NULL, *qual = NULL;
    const char* end = NULL;
    size_t seq_len = 0, qual_len = 0;
    rc_t rc;

    FileReadData_init(file->spot, false);
    end = file->line + file->line_len;

    /* last separator precedes the quality, the one before it precedes the sequence */
    qual = memrchr(file->line, sep, file->line_len);
    if( qual == NULL ) {
        return false;
    }
    seq = memrchr(file->line, sep, qual - file->line);
    if( seq == NULL ) {
        return false;
    }
    if( parse_spot_name(file->file, file->spot, file->line, seq - file->line, 1) == 0 ) {
        return false;
    }

    /* skip leading spaces; bounds are tested BEFORE dereferencing and the
       sequence scan never passes the quality separator, so seq_len cannot
       underflow (matters when sep itself is a space) */
    do {
        seq = seq + 1;
    } while( seq < qual && *seq == ' ' );
    seq_len = qual - seq;
    do {
        qual = qual + 1;
    } while( qual < end && *qual == ' ' );
    qual_len = end - qual;

    /* lengths are tested first so an empty chunk is never dereferenced */
    if( seq_len == 0 || *seq == sep || *seq == ' ' ||
        qual_len == 0 || *qual == sep || *qual == ' ' ) {
        return false;
    }
    if( !match_seq_to_qual(seq, seq_len, qual, qual_len) ) {
        return false;
    }

    if( (rc = pstring_assign(&file->spot->read.seq, seq, seq_len)) == 0 ) {
        if( pstring_is_fasta(&file->spot->read.seq) ) {
            if( (rc = pstring_assign(&file->spot->read.qual, qual, qual_len)) == 0 ) {
                file->spot->read.qual_type = file->qualType;
                return true;
            }
        }
        /* not usable after all: undo the stored sequence */
        file->spot->read.seq.len = 0;
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=storing read data");
    }
    return false;
}
Ejemplo n.º 11
0
/*
 * Handle one SRF read chunk for the ABSOLID loader and write the spot.
 *
 * The SRF part yields the read flags and the read id; the read id is then
 * analysed by fe_new_read(), which reports the read type and a label. The
 * ZTR blocks that follow are decoded one by one:
 *   BASE - sequence, stored per read (whole block for a single-read chunk,
 *          split by the region start offsets for a whole-spot chunk)
 *   CNF1 - quality values, stored the same way
 *   SAMP - signal of one channel (0FAM/1CY3/2TXR/3CY5), unless skip_signal
 * Processing stops at the first error, so a failed block can never be masked
 * by a later successful one. When everything succeeded the collected reads
 * are passed to SRAWriteAbsolid_Write().
 *
 * NOTE(review): region start offsets are used to slice block data without
 * being compared against the block's datasize - confirm that pstring_assign
 * or an earlier stage rejects inconsistent offsets.
 *
 * Returns 0 on success, otherwise the rc of the first failure.
 */
static
rc_t parse_v1_read(SRF_context *ctx, ZTR_Context *ztr_ctx, const uint8_t *data, size_t size)
{
    rc_t rc = 0;
    size_t i, parsed;
    ztr_raw_t ztr_raw;
    ztr_t ztr;
    enum ztr_chunk_type type;
    fe_context_t* fe = (fe_context_t*)ctx;

    uint8_t flags;
    pstring readId;
    EAbisolidReadType read_type;
    pstring label;

    AbsolidRead read[ABSOLID_FMT_MAX_NUM_READS];
        
    if( fe->region.nreads == 0 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcNotFound);
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "missing region chunk before 1st read chunk", NULL);
    }
    if( (rc = SRF_ParseReadChunk(data, size, &parsed, &flags, &readId)) != 0 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rc);
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "SRF parsing failure", NULL);
    }
    /* a failure to buffer the ZTR data must not go unnoticed */
    if( (rc = ABI_ZTR_AddToBuffer(ztr_ctx, data + parsed, size - parsed)) != 0 ) {
        return rc;
    }

    /* readId will have spotname */
    if( (rc = fe_new_read(fe, &readId, &read_type, &label)) != 0 ) {
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "parsing spot name suffix", NULL);
    }
    for(i = 0; i < sizeof(read) / sizeof(read[0]); i++) {
        AbsolidRead_Init(&read[i]);
    }
    /* stop at the first error: otherwise the next ParseBlock would overwrite rc */
    while(rc == 0 && !ABI_ZTR_BufferIsEmpty(ztr_ctx)) {
        if( (rc = ABI_ZTR_ParseBlock(ztr_ctx, &ztr_raw)) != 0 ||
            (rc = ABI_ZTR_ProcessBlock(ztr_ctx, &ztr_raw, &ztr, &type)) != 0 ) {
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "ZTR parsing failure", NULL);
            break;
        }
        switch (type) {
        case BASE:
            if(ztr.sequence->datatype != i8) {
                rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "read: expected 8-bit datatype", NULL);
            } else if( read_type > eAbisolidReadType_SPOT ) {
                /* chunk holds a single read: the whole block is its sequence */
                int read_number = AbisolidReadType2ReadNumber[read_type];
                if( (rc = pstring_assign(&read[read_number].seq, ztr.sequence->data, ztr.sequence->datasize)) == 0 ) {
                    /* grab 1st, may be the only cs_key */
                    read[read_number].cs_key = fe->region.cs_key[0];
                    for(i = 1; i < fe->region.nreads; i++) {
                        if( read_type == fe->region.type[i] ) {
                            read[read_number].cs_key = fe->region.cs_key[i];
                            break;
                        }
                    }
                    SRF_set_read_filter(&read[read_number].filter, flags);
                    rc = pstring_copy(&read[read_number].label, &label);
                    DEBUG_MSG(3, ("SRF READ: '%s'\n", read[read_number].seq.data));
                }
                if( rc != 0 ) {
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying read", NULL);
                }
            } else {
                /* chunk holds the whole spot: split it by the region start offsets */
                for(i = 0; rc == 0 && i < fe->region.nreads; i++) {
                    int read_number = AbisolidReadType2ReadNumber[fe->region.type[i]];
                    size_t len = (i + 1 >= fe->region.nreads ? ztr.sequence->datasize : fe->region.start[i + 1]) - fe->region.start[i];
                    rc = pstring_assign(&read[read_number].seq, &ztr.sequence->data[fe->region.start[i]], len);
                    read[read_number].cs_key = fe->region.cs_key[i];
                    SRF_set_read_filter(&read[read_number].filter, flags);
                    /* copy the label only if the sequence was stored, so that a
                       failed assignment is not hidden by a successful copy */
                    if( rc == 0 && fe->region.label[i].len != 0 ) {
                        rc = pstring_copy(&read[read_number].label, &fe->region.label[i]);
                    }
                    DEBUG_MSG(3, ("SRF READ[%u]: '%s'\n", i, read[read_number].seq.data));
                }
                if( rc != 0 ) {
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying reads", NULL);
                }
            }
            break;
        case CNF1:
            if(ztr.quality1->datatype != i8) {
                rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "quality: expected 8-bit datatype", NULL);
            } else if( read_type > eAbisolidReadType_SPOT ) {
                int read_number = AbisolidReadType2ReadNumber[read_type];
                if( (rc = pstring_assign(&read[read_number].qual, ztr.quality1->data, ztr.quality1->datasize)) == 0 ) {
                    DEBUG_MSG(3, ("SRF QUAL: %u bytes\n", read[read_number].qual.len));
                }
                if( rc != 0 ) {
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying quality", NULL);
                }
            } else {
                for(i = 0; rc == 0 && i < fe->region.nreads; i++) {
                    int read_number = AbisolidReadType2ReadNumber[fe->region.type[i]];
                    size_t len = (i + 1 >= fe->region.nreads ? ztr.quality1->datasize : fe->region.start[i + 1]) - fe->region.start[i];
                    rc = pstring_assign(&read[read_number].qual, &ztr.quality1->data[fe->region.start[i]], len);
                    DEBUG_MSG(3, ("SRF QUAL[%u]: %u bytes\n", i, read[read_number].qual.len));
                }
                if( rc != 0 ) {
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying qualities", NULL);
                }
            }
            break;
        case SAMP:
            if( !fe->skip_signal ) {
                int stype = ABSOLID_FMT_COLMASK_NOTSET;
                /* validate the block and map the channel name to a column mask */
                if(ztr.signal->datatype != f32) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "signal: expected 32-bit float datatype", NULL);
                } else if( (ztr.signal->datasize % sizeof(float)) != 0 ) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInvalid);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "signal: size not 32-bit float aligned", NULL);
                } else if (ztr.signal->channel == NULL) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcIncomplete);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "SIGNAL column: missing channel type", NULL);
                } else if(strcmp(ztr.signal->channel, "0FAM") == 0) {
                    stype = ABSOLID_FMT_COLMASK_FAM;
                } else if(strcmp(ztr.signal->channel, "1CY3") == 0) {
                    stype = ABSOLID_FMT_COLMASK_CY3;
                } else if(strcmp(ztr.signal->channel, "2TXR") == 0) {
                    stype = ABSOLID_FMT_COLMASK_TXR;
                } else if(strcmp(ztr.signal->channel, "3CY5") == 0) {
                    stype = ABSOLID_FMT_COLMASK_CY5;
                } else {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "SIGNAL column: unexpected channel type", NULL);
                }
#if __BYTE_ORDER == __LITTLE_ENDIAN
                /* swap each 32-bit value in place on little-endian hosts */
                for(i = 0; rc == 0 && i < ztr.signal->datasize; i += 4) {
                    uint32_t* r = (uint32_t*)&ztr.signal->data[i];
                    *r = bswap_32(*r);
                }
#endif
                if( rc == 0 ) {
                    if( read_type > eAbisolidReadType_SPOT ) {
                        int read_number = AbisolidReadType2ReadNumber[read_type];
                        pstring* d = NULL;
                        switch(stype) {
                            case ABSOLID_FMT_COLMASK_FAM:
                                read[read_number].fs_type = eAbisolidFSignalType_FAM;
                                d = &read[read_number].fxx;
                                break;
                            case ABSOLID_FMT_COLMASK_CY3:
                                d = &read[read_number].cy3;
                                break;
                            case ABSOLID_FMT_COLMASK_TXR:
                                d = &read[read_number].txr;
                                break;
                            case ABSOLID_FMT_COLMASK_CY5:
                                d = &read[read_number].cy5;
                                break;
                        }
                        if( d ) {
                            rc = pstring_assign(d, ztr.signal->data, ztr.signal->datasize);
                            DEBUG_MSG(3, ("SRF SIGNAL[%s]: %u bytes\n", ztr.signal->channel, d->len));
                        } else {
                            rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
                        }
                        if( rc != 0 ) {
                            SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying signal", NULL);
                        }
                    } else {
                        if( fe->region.nreads <= 0 || fe->region.nreads > ABSOLID_FMT_MAX_NUM_READS ) {
                            rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
                            SRALoaderFile_LOG(fe->ctx.file, klogErr, rc, "read count $(c)", PLOG_U8(c), fe->region.nreads);
                        }

                        /* region offsets count values; signal data is addressed in bytes */
                        for(i = 0; rc == 0 && i < fe->region.nreads; i++) {
                            pstring* d = NULL;
                            int read_number = AbisolidReadType2ReadNumber[fe->region.type[i]];
                            size_t len = (i + 1 >= fe->region.nreads) ? ztr.signal->datasize : (fe->region.start[i + 1] * sizeof(float));
                            len -= fe->region.start[i] * sizeof(float);
                            switch(stype) {
                                case ABSOLID_FMT_COLMASK_FAM:
                                    read[read_number].fs_type = eAbisolidFSignalType_FAM;
                                    d = &read[read_number].fxx;
                                    break;
                                case ABSOLID_FMT_COLMASK_CY3:
                                    d = &read[read_number].cy3;
                                    break;
                                case ABSOLID_FMT_COLMASK_TXR:
                                    d = &read[read_number].txr;
                                    break;
                                case ABSOLID_FMT_COLMASK_CY5:
                                    d = &read[read_number].cy5;
                                    break;
                            }
                            if( d ) {
                                rc = pstring_assign(d, &ztr.signal->data[fe->region.start[i] * sizeof(float)], len);
                                DEBUG_MSG(3, ("SRF SIGNAL[%s]: %u bytes\n", ztr.signal->channel, d->len));
                            } else {
                                rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
                            }
                        }
                        if( rc != 0 ) {
                            SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying signals", NULL);
                        }
                    }
                }
            }
            break;
        default:
            break;
        }
        /* release the decoded block even when handling it failed */
        if(type != none && type != ignore) {
            free(*(void **)&ztr);
        }
    }
    if(rc == 0) {
        if( read_type <= eAbisolidReadType_SPOT ) {
            rc = SRAWriteAbsolid_Write(fe->writer, ctx->file, &readId, NULL, &read[0], &read[1]);
        } else {
            /* single-read chunk: pass only the slot that was filled */
            switch( AbisolidReadType2ReadNumber[read_type] ) {
                case 0:
                    rc = SRAWriteAbsolid_Write(fe->writer, ctx->file, &readId, NULL, &read[0], NULL);
                    break;
                case 1:
                    rc = SRAWriteAbsolid_Write(fe->writer, ctx->file, &readId, NULL, NULL, &read[1]);
                    break;
                default:
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "more than 2 reads", NULL);
                    break;
            }
        }
    }
    return rc;
}
Ejemplo n.º 12
0
static
rc_t FastqLoaderFmt_WriteData(FastqLoaderFmt* self, uint32_t argc, const SRALoaderFile* const argv[], int64_t* spots_bad_count)
{
    rc_t rc = 0;
    uint32_t i, g = 0;
    FastqFileInfo* files = NULL;
    bool done;
    static IlluminaSpot spot;
 
    if( (files = calloc(argc, sizeof(*files))) == NULL ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcMemory, rcInsufficient);
    }

    for(i = 0; rc == 0 && i < argc; i++) {
        ExperimentQualityType qType;
        FastqFileInfo* file = &files[i];

        file->file = argv[i];
        FileReadData_init(file->spot, false);
        FileReadData_init(&file->spot[1], false);
        if( (rc = SRALoaderFile_QualityScoringSystem(file->file, &qType)) == 0 &&
            (rc = SRALoaderFile_QualityEncoding(file->file, &file->qualEnc)) == 0 &&
            (rc = SRALoaderFile_AsciiOffset(file->file, &file->qualOffset)) == 0 ) {

            file->qualType = ILLUMINAWRITER_COLMASK_NOTSET;

            if( qType == eExperimentQualityType_Undefined ) {
                qType = self->processing->quality_type;
                file->qualOffset = self->processing->quality_offset;
            }
            switch(qType) {
                case eExperimentQualityType_LogOdds:
                case eExperimentQualityType_Other:
                    if( self->w454 != NULL || self->wIonTorrent != NULL ) {
                        rc = RC(rcSRA, rcFormatter, rcConstructing, rcParam, rcInvalid);
                        LOGERR(klogInt, rc, "quality type other than Phred is not supported for this PLATFORM");
                    }
                    file->qualMin = -40;
                    file->qualMax = 41;
                    file->qualType = ILLUMINAWRITER_COLMASK_QUALITY_LOGODDS1;
                    break;
                default:
                    SRALoaderFile_LOG(file->file, klogWarn, rc, 
                        "quality_scoring_system attribute not set for this file, using Phred as default", NULL);
                case eExperimentQualityType_Phred:
                    file->qualType = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
                    file->qualMin = 0;
                    file->qualMax = (self->wIllumina) ? 61: 127;
                    break;
            }
        }
    }
    do {
        done = true;
        for(i = 0; rc == 0 && i < argc; i++) {
            FastqFileInfo* file = &files[i];
            if( (rc = read_next_spot(self, file)) != 0 || !file->spot->ready ) {
                continue;
            }
            done = false;
#if _DEBUGGING
            {{
                FileReadData* ss = file->spot;
                do {
                    DEBUG_MSG(3, ("file-%u: name:'%s', bc:%s, rd:%i, flt:%hu, seq '%.*s', qual %u bytes\n",
                                  i + 1, ss->name.data, ss->barcode.data, ss->read.read_id, ss->read.filter,
                                  ss->read.seq.len, ss->read.seq.data, ss->read.qual.len));
                    if( ss == &file->spot[1]){ break; }
                    ss = file->spot[1].ready ? &file->spot[1] : NULL;
                } while( ss != NULL );
            }}
#endif
        }
        if( rc != 0 || done ) {
            break;
        }
        /* collect spot reads, matching by spot name
         * spot data may be split across multiple files
         */
        IlluminaSpot_Init(&spot);
        for(i = 0; rc == 0 && i < argc; i++) {
            FileReadData* fspot = files[i].spot[0].ready ? &files[i].spot[0] : NULL;
            while(rc == 0 && fspot != NULL ) {
                rc = IlluminaSpot_Add(&spot, &fspot->name, &fspot->barcode, &fspot->read);
                if( rc == 0 ) {
                    g = i;
                    fspot->ready = false;
                } else if( GetRCState(rc) == rcIgnored ) {
                    rc = 0;
                } else {
                    SRALoaderFile_LOG(files[i].file, klogErr, rc, "$(msg)", "msg=adding data to spot");
                }
                if( fspot == &files[i].spot[1]) { break; }
                fspot = files[i].spot[1].ready ? &files[i].spot[1] : NULL;
            }
        }
        if( rc == 0 ) {
            if( self->wIllumina != NULL ) {
                if( (rc = SRAWriterIllumina_Write(self->wIllumina, argv[0], &spot)) != 0 &&
                    GetRCTarget(rc) == rcFormatter && GetRCContext(rc) == rcValidating ) {
                    SRALoaderFile_LOG(files[g].file, klogWarn, rc, "$(msg) $(spot_name)", "msg=bad spot,spot_name=%.*s",
                                                spot.name->len, spot.name->data);
                    self->spots_bad_count++;
                    if( self->spots_bad_allowed < 0 ||
                        self->spots_bad_count <= self->spots_bad_allowed ) {
                        rc = 0;
                    }
                }
            } else if( spot.nreads != 1 ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcUnsupported);
                SRALoaderFile_LOG(files[g].file, klogErr, rc, "$(msg)", "msg=multiple reads for this platform");
            } else if( self->wIonTorrent != NULL ) {
                rc = SRAWriterIonTorrent_WriteRead(self->wIonTorrent, argv[0], spot.name,
                                                   spot.reads[0].seq, spot.reads[0].qual, NULL, NULL, 0, 0, 0, 0);
            } else {
                rc = SRAWriter454_WriteRead(self->w454, argv[0], spot.name,
                                            spot.reads[0].seq, spot.reads[0].qual, NULL, NULL, 0, 0, 0, 0);
            }
        }
    } while( rc == 0 );
    free(files);
    *spots_bad_count = self->spots_bad_count;
    return rc;
}
Ejemplo n.º 13
0
/* Reads the data for a single spot from a file; the data may be partial.
 *
 * Nothing is read while the first read slot (file->spot[0]) is still marked
 * ready, i.e. the previously parsed data has not been consumed yet.
 * Recognized inputs:
 *   - single line forms, detected by find_seq_qual_by_sep() with ':' or ' ';
 *   - '@' deflines: 4 line form, optionally followed by a second read with
 *     the same name and barcode but a different read id (8 line form);
 *   - '>' deflines: defline followed by sequence or quality data.
 * Qualities of ready reads are converted with pstring_quality_convert(); when
 * file->qualOffset is 0 the offset is detected (33 first, then 64) and
 * remembered in file for the following spots.
 *
 * Returns 0 on success and at end of file (file->line == NULL, nothing marked
 * ready); otherwise the rc passed to SRALoaderFile_LOG.
 * NOTE(review): relies on SRALoaderFile_LOG returning the rc it was given, as
 * the other 'return SRALoaderFile_LOG(...)' statements in this file do.
 */
static
rc_t read_next_spot(FastqLoaderFmt* self, FastqFileInfo* file)
{
    rc_t rc = 0;

    if( file->spot->ready ) {
        /* data still not used */
        return 0;
    }
    FileReadData_init(file->spot, false);
    FileReadData_init(&file->spot[1], false);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    if( find_seq_qual_by_sep(self, file, ':') || find_seq_qual_by_sep(self, file, ' ') ) {
        /* single line forms */
        file->line = NULL; /* line consumed */
        file->spot->ready = true;
    } else if( file->line[0] == '>' || file->line[0] == '@' ) {
        /* 4 or 8 line format */
        FileReadData sd;
        uint8_t word = 0, best_word = 0;
        uint8_t score = 0, best_score = 0;

        /* find and parse spot name on defline: try each word in turn and keep
         * the one with the highest score; a score of 0 means no more words */
        do {
            score = parse_spot_name(file->file, &sd, &file->line[1], file->line_len - 1, ++word);
            if( score > best_score ) {
                if( (rc = pstring_copy(&file->spot->name, &sd.name)) != 0 ||
                    (rc = pstring_copy(&file->spot->barcode, &sd.barcode)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying read name");
                }
                file->spot->read.read_id = sd.read.read_id;
                best_score = score;
                best_word = word; /* used below for quality defline parsing */
            }
        } while( score != 0 );
        if( best_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcId, rcNotFound);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not detected");
        }
        if( file->line[0] == '@' ) {
            /* NOTE(review): the ready flag of the read is not set here;
             * presumably read_spot_data_3lines() does it -- confirm */
            if( (rc = read_spot_data_3lines(file, file->spot, best_word, best_score, file->qualType)) != 0 ) {
                return rc;
            }
            /* now we MAY have 5th line in buffer so we can check if it's 2nd read for 8 lines format */
            if( file->line_len != 0 && file->line != NULL && file->line[0] == '@' ) {
                /* try to find read id on next line */
                FileReadData_init(&file->spot[1], false);
                if( parse_spot_name(file->file, &file->spot[1], &file->line[1], file->line_len - 1, best_word) == best_score ) {
                    if( pstring_cmp(&file->spot->name, &file->spot[1].name) == 0 &&
                        pstring_cmp(&file->spot->barcode, &file->spot[1].barcode) == 0 &&
                        file->spot->read.read_id != file->spot[1].read.read_id ) {
                        /* since it is different read id with same name and barcode, fill up second read */
                        if( (rc = read_spot_data_3lines(file, &file->spot[1], best_word, best_score, file->qualType)) != 0 ) {
                            return rc;
                        }
                    }
                }
            }
        } else {
            /* 2 line seq or quality form */
            file->line = NULL; /* line consumed */
            /* read sequence/quality */
            if( (rc = read_multiline_seq_or_qual(file, '>', &file->spot->read.seq)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading seq/qual data");
            }
            if( file->spot->read.seq.len == 0 ) {
                /* rc is still 0 here: without an explicit error code the
                 * caller would mistake this for a clean end of file */
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcEmpty);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=empty string reading seq/qual data");
            } else if( !pstring_is_fasta(&file->spot->read.seq) ) {
                /* not bases: what was read is quality, move it to qual */
                if( (rc = pstring_copy(&file->spot->read.qual, &file->spot->read.seq)) != 0 ) {
                    /* do not mark the spot ready with half-moved data */
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying quality data");
                }
                file->spot->read.qual_type = file->qualType;
                pstring_clear(&file->spot->read.seq);
            }
            file->spot->ready = true;
        }
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=file corrupt or format unknown");
    }
    if( rc == 0 ) {
        int k;
        /* convert qualities of both read slots, if present */
        for(k = 0; k < 2; k++) {
            FileReadData* rd = &file->spot[k];
            if( !rd->ready || rd->read.qual_type == ILLUMINAWRITER_COLMASK_NOTSET ) {
                continue;
            }
            if( file->qualOffset == 0 ) {
                /* detect and remember: try offset 33 first, fall back to 64
                 * if a value comes out of range */
                file->qualOffset = 33;
                file->qualMax = 94;
                rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                if( GetRCState(rc) == rcOutofrange ) {
                    file->qualOffset = 64;
                    file->qualMax = 61;
                    rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                }
            } else {
                if( file->qualOffset == 33 ) {
                    file->qualMax = 94;
                }
                rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
            }
            if( rc != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting quality");
            }
        }
    }
    return rc;
}
Ejemplo n.º 14
0
/* Parses the spot name out of the word_number-th (1-based) space separated
 * word of str, which is len bytes long.
 * Looks for name(#barcode)?([\/.]\d)?
 * On success fills spot->name, spot->barcode and spot->read.read_id.
 * Returns the score of the found parts (one point for each of: read id
 * suffix, barcode, name, "_R<digit>" marker inside the name).
 * score == 0: word not found, or a part could not be stored.
 */
static
uint8_t parse_spot_name(const SRALoaderFile* file, FileReadData* spot, const char* str, size_t len, uint8_t word_number)
{
    uint8_t w, score = 0;
    const char* name, *name_end;
    const char* const str_end = str + len;

    name = name_end = str;
    /* set [name, name_end) to the word_number-th word */
    for(w = 1; w <= word_number; w++ ) {
        /* skip consecutive spaces; the bound is tested first so that
         * str[len] is never read */
        while( name_end != str_end && *name_end == ' ' ) {
            name_end++;
        }
        name = name_end;
        name_end = memchr(name, ' ', (size_t)(str_end - name));
        if( name_end == NULL ) {
            if( w == word_number ) {
                /* requested word is the last one on the line */
                name_end = str_end;
            }
            break;
        }
    }
    if( name_end != NULL && name != name_end ) {
        char* x;
        rc_t rc;
        size_t span;

        /* init only name portion */
        FileReadData_init(spot, true);
        --name_end; /* goto last char */
        /* a read id suffix needs at least two chars, so name_end[-1] is only
         * inspected when it still belongs to the word */
        if( name_end > name && isdigit((unsigned char)name_end[0]) &&
            (name_end[-1] == '\\' || name_end[-1] == '/') ) {
            score++;
            spot->read.read_id = name_end[0] - '0';
            name_end -= 2;
        } else if( name_end > name && isdigit((unsigned char)*name_end) && name_end[-1] == '.' ) {
            int q = 0;
            if( memrchr(name, '#', (size_t)(name_end - name)) != NULL ) {
                /* have barcode -> this is read id */
                q = 4;
            } else {
                /* may be a read id, check to see if 4 coords precede it:
                 * walk back over digits counting separators */
                const char* end = name_end - 1;
                while( end > name ) {
                    --end;
                    if( strchr(":|_", *end) != NULL ) {
                        q++;
                    } else if( !isdigit((unsigned char)*end) ) {
                        break;
                    }
                }
            }
            if( q == 4 ) {
                score++;
                spot->read.read_id = name_end[0] - '0';
                name_end -= 2;
            }
        }
        /* name_end may have moved in front of name (word like "/1");
         * never hand a negative length to memrchr */
        span = (name_end > name) ? (size_t)(name_end - name) : 0;
        if( (x = memrchr(name, '#', span)) != NULL ) {
            score++;
            if( (rc = pstring_assign(&spot->barcode, x + 1, name_end - x)) != 0 ) {
                SRALoaderFile_LOG(file, klogErr, rc, "barcode $(b)", "b=%.*s", (int)(name_end - x), x + 1);
                return 0;
            }
            if( pstring_strcmp(&spot->barcode, "0") == 0 ) {
                pstring_clear(&spot->barcode);
            } else if( spot->barcode.len >= 4 &&
                       (strncmp(spot->barcode.data, "0/1_", 4) == 0 || strncmp(spot->barcode.data, "0/2_", 4) == 0) ) {
                /* "0/<read id>_<barcode>": take the read id from the prefix and drop the prefix */
                spot->read.read_id = spot->barcode.data[2] - '0';
                /* NOTE(review): source and destination overlap and the result is ignored;
                 * confirm pstring_assign tolerates assigning from its own buffer */
                pstring_assign(&spot->barcode, &spot->barcode.data[4], spot->barcode.len - 4);
            }
            name_end = --x;
        }
        score++;
        if( (rc = pstring_assign(&spot->name, name, name_end - name + 1)) != 0 ) {
            SRALoaderFile_LOG(file, klogErr, rc, "spot name $(n)", "n=%.*s", (int)(name_end - name + 1), name);
            return 0;
        }
        /* search for _R\d\D in name and use it as read id, remove from name or spot won't assemble
         * NOTE(review): strrchr returns the LAST 'R', so only that occurrence
         * is ever tested (after x++ no further 'R' can follow); confirm
         * whether strchr was intended */
        x = spot->name.data;
        while( (x = strrchr(x, 'R')) != NULL ) {
            if( x != spot->name.data && *(x - 1) == '_' &&
                isdigit((unsigned char)*(x + 1)) && !isalnum((unsigned char)*(x + 2)) ) {
                score++;
                if( spot->read.read_id == -1 ) {
                    spot->read.read_id = *(x + 1) - '0';
                }
                /* cut "_R<digit>" (3 chars) out of the name; the regions
                 * overlap, so memmove is required instead of strcpy */
                memmove(x - 1, x + 2, strlen(x + 2) + 1);
                spot->name.len -= 3;
                break;
            }
            x++;
        }
        /* find last '=' and use only whatever is to the left of it */
        if( (x = memrchr(spot->name.data, '=', spot->name.len)) != NULL ) {
            rc = pstring_assign(&spot->name, spot->name.data, (x - spot->name.data) );
        }
    }
    return score;
}
Ejemplo n.º 15
0
/* Loads all spots of one file group.
 *
 * n is the FGroup node, d points to a FGroup_Parse_data carrying the loader
 * (self), the spot being assembled and the resulting rc.
 * Each round reads the next spot from every file of the group, merges the
 * per-file pieces into one spot with IlluminaSpot_Add() and writes it with
 * SRAWriterIllumina_Write(). Rounds continue until no file delivers data
 * or an error occurs.
 * Returns true when data->rc ended up non-zero, false otherwise.
 */
bool FGroup_Parse( SLNode *n, void *d )
{
    FGroup_Parse_data* const ctx = (FGroup_Parse_data*)d;
    FGroup* const grp = (FGroup*)n;
    const SRALoaderFile* data_block_ref = NULL;

    ctx->rc = 0;
    for( ;; ) {
        IlluminaFileInfo* cur;
        bool have_data = false;

        /* fetch the next spot from every file which has none pending */
        for(cur = grp->files; ctx->rc == 0 && cur != NULL; cur = cur->next) {
            ctx->rc = read_next_spot(grp->blk_pfx, cur);
            if( ctx->rc == 0 && cur->ready ) {
                have_data = true;
            }
        }
        if( ctx->rc != 0 || !have_data ) {
            break;
        }

        /* collect spot reads, matching by spot name
         * spot data may be split across multiple files
         */
        IlluminaSpot_Init(&ctx->spot);
        for(cur = grp->files; ctx->rc == 0 && cur != NULL; cur = cur->next) {
            if( !cur->ready ) {
                continue;
            }
            if( (cur->type == eIlluminaNativeFileTypeNoise && ctx->self->skip_noise) ||
                (cur->type == eIlluminaNativeFileTypeIntensity && ctx->self->skip_intensity) ||
                (cur->type == eIlluminaNativeFileTypeSignal && ctx->self->skip_signal) ) {
                /* this kind of data was asked to be skipped: just drop it */
                cur->ready = false;
                continue;
            }
            data_block_ref = cur->file;
            if( cur->type == eIlluminaNativeFileTypeQSeq && (grp->mask & eIlluminaNativeFileTypeQuality4) ) {
                /* drop quality1 from qseq data */
                pstring_clear(&cur->read.qual);
            } else if( cur->type == eIlluminaNativeFileTypeQuality4 ) {
                /* prb files carry no spot name, borrow it from a neighbour file */
                IlluminaFileInfo* neib = cur->next ? cur->next : cur->prev;
                ctx->rc = pstring_copy(&cur->name, &neib->name);
                if( ctx->rc != 0 ) {
                    SRALoaderFile_LOG(cur->file, klogErr, ctx->rc, "$(msg) '$(n)'", "msg=syncing prb spot name,n=%s", neib->name.data);
                    continue; /* loop condition ends the scan */
                }
            }
            ctx->rc = IlluminaSpot_Add(&ctx->spot, &cur->name, &cur->barcode, &cur->read);
            if( ctx->rc == 0 ) {
                cur->ready = false;
                continue;
            }
            if( GetRCState(ctx->rc) != rcIgnored ) {
                continue; /* hard error, loop condition ends the scan */
            }
            /* name mismatch: count it and skip the spot for all files in the group */
            SRALoaderFile_LOG(cur->file, klogWarn, ctx->rc, "$(msg) '$(s1)' <> '$(s2)'",
                            "msg=spot name mismatch,s1=%.*s,s2=%.*s",
                            ctx->spot.name->len, ctx->spot.name->data, cur->name.len, cur->name.data);
            ctx->self->spots_bad_count++;
            for(cur = grp->files; cur != NULL; cur = cur->next) {
                cur->ready = false;
                SRALoaderFile_LOG(cur->file, klogWarn, ctx->rc,
                                  "$(msg) '$(n)'", "msg=skipped spot,n=%s", cur->name.data);
            }
            if( ctx->self->spots_bad_allowed >= 0 &&
                ctx->self->spots_bad_count > ctx->self->spots_bad_allowed ) {
                /* too many bad spots: turn the tolerated state into a failure */
                ctx->rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
            }
            break;
        }

        if( GetRCState(ctx->rc) == rcIgnored ) {
            /* spot was skipped within the allowed limit, go on with the next one */
            ctx->rc = 0;
            continue;
        }
        if( ctx->rc != 0 ) {
            break;
        }
        ctx->rc = SRAWriterIllumina_Write(ctx->self->writer, data_block_ref, &ctx->spot);
        if( ctx->rc != 0 ) {
            break;
        }
    }
    return ctx->rc != 0;
}
Ejemplo n.º 16
0
/* Reads the data for a single spot from a file; the data may be partial.
 *
 * blk_pfx: optional prefix (may be NULL). For coordinate based files it is
 *          prepended to the spot name as "<blk_pfx>:<name>"; for quality-4
 *          files it becomes the name itself.
 * file:    per-file state; nothing is read while file->ready is still set.
 *
 * On success the parsed line is consumed (file->line = NULL) and file->ready
 * is set. Returns 0 on success and at end of file (file->line == NULL with
 * ready not set), otherwise the rc passed to SRALoaderFile_LOG.
 */
static
rc_t read_next_spot(const char* blk_pfx, IlluminaFileInfo* file)
{
    rc_t rc = 0;
    const char* tail;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    IlluminaFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    /* the tail starts at the line just read; taking it before the read would
     * leave a stale (possibly NULL) pointer for the types that do not go
     * through read_spot_coord() */
    tail = file->line;

    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
            if( (rc = parse_qseq(file, file->line, file->line_len)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading qseq");
            }
            break;

        case eIlluminaNativeFileTypeFasta:
        case eIlluminaNativeFileTypeNoise:
        case eIlluminaNativeFileTypeIntensity:
        case eIlluminaNativeFileTypeSignal:
            {
                /* read only common first 4 coords into name and prepend with DATA_BLOCK/@name */
                if( (rc = read_spot_coord(file, file->line, file->line_len, &tail)) == 0 ) {
                    if( blk_pfx != NULL ) {
                        pstring tmp_name;
                        if( (rc = pstring_copy(&tmp_name, &file->name)) == 0 &&
                            (rc = pstring_assign(&file->name, blk_pfx, strlen(blk_pfx))) == 0 &&
                            (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                            rc = pstring_concat(&file->name, &tmp_name);
                        }
                    }
                }
                if( rc != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading spot coord");
                }
                break;
            }

        case eIlluminaNativeFileTypeQuality4:
            if( (rc = read_quality(file->line, file->line_len, &file->read)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
            }
            /* blk_pfx is optional (see the coord case above): never strlen(NULL) */
            if( blk_pfx != NULL &&
                (rc = pstring_assign(&file->name, blk_pfx, strlen(blk_pfx))) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=name for quality 4");
            }
            break;

        default:
            rc = RC(rcSRA, rcFormatter, rcReading, rcFileFormat, rcUnknown);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=processing data line");
    }

    /* process tail (after coords) for some file types */
    file->line_len -= tail - file->line; /* length of tail */
    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
        case eIlluminaNativeFileTypeQuality4:
        default:
            /* completely processed before */
            break;

        case eIlluminaNativeFileTypeFasta:
            if( (rc = pstring_assign(&file->read.seq, tail, file->line_len)) != 0 ||
                !pstring_is_fasta(&file->read.seq) ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading fasta");
            }
            break;

        case eIlluminaNativeFileTypeNoise:
            if( (rc = read_signal(tail, file->line_len, &file->read.noise)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting noise");
            }
            break;

        case eIlluminaNativeFileTypeIntensity:
            if( (rc = read_signal(tail, file->line_len, &file->read.intensity)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting intensity");
            }
            break;

        case eIlluminaNativeFileTypeSignal:
            if( (rc = read_signal(tail, file->line_len, &file->read.signal)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting signal");
            }
            break;
    }
    file->line = NULL; /* line consumed */
    file->ready = true;
#if _DEBUGGING
    DEBUG_MSG(3, ("name:'%s' [%li:%li:%li:%li]\n", file->name.data, 
                file->coord[0], file->coord[1], file->coord[2], file->coord[3]));
    if( file->read.seq.len ) {
        DEBUG_MSG(3, ("seq:'%.*s'\n", file->read.seq.len, file->read.seq.data));
    }
    if( file->read.qual.len ) {
        DEBUG_MSG(3, ("qual{0x%x}: %u bytes\n", file->read.qual_type, file->read.qual.len));
    }
#endif
    return 0;
}
Ejemplo n.º 17
0
/* Reads and validates the SFF common header at the current file position.
 *
 * When a header was already read before (self->header.magic_number != 0) the
 * stream is treated as a concatenation of files: the previous header, flow
 * chars and key sequence are remembered and the new header must match them
 * in key length, flows per read, flow chars and key sequence.
 * If the magic number does not match right after a previous file, one retry
 * is made after skipping the bytes needed to pad the previous index section
 * to an 8 byte boundary.
 * On success flow chars and key sequence are passed to the 454 or IonTorrent
 * writer, whichever is set.
 *
 * Returns 0 on success, otherwise the failing rc (logged for most failures).
 */
static
rc_t SFFLoaderFmtReadCommonHeader(SFFLoaderFmt* self, const SRALoaderFile *file)
{
    rc_t rc = 0;
    bool skiped_idx_pad = false;
    uint16_t head_sz;
    SFFCommonHeader prev_head;
    pstring prev_flow_chars;
    pstring prev_key_seq;

    if( (rc = SRALoaderFile_Offset(file, &self->index_correction)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "Reading initial file position", NULL);
        return rc;
    }
SkipIndexPad:
    self->index_correction += self->file_advance;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, SFFCommonHeader_size, NULL, true)) != 0) {
        SRALoaderFile_LOG(file, klogErr, rc, "common header, needed $(needed) bytes",
                          PLOG_U32(needed), SFFCommonHeader_size);
        return rc;
    }
    /* capture the previous header on the first pass only: on the retry
     * self->header already holds the misaligned header read below */
    if( !skiped_idx_pad ) {
        if( self->header.magic_number != 0 ) {
            /* next file in stream, remember prev to sync to each */
            memcpy(&prev_head, &self->header, sizeof(SFFCommonHeader));
            pstring_copy(&prev_flow_chars, &self->flow_chars);
            pstring_copy(&prev_key_seq, &self->key_seq);
        } else {
            prev_head.magic_number = 0;
            prev_head.index_length = 0;
        }
    }

    memcpy(&self->header, self->file_buf, SFFCommonHeader_size);
#if __BYTE_ORDER == __LITTLE_ENDIAN
    /* multi-byte header fields are byte-swapped on little-endian hosts */
    self->header.magic_number = bswap_32(self->header.magic_number);
    self->header.version = bswap_32(self->header.version);
    self->header.index_offset = bswap_64(self->header.index_offset);
    self->header.index_length = bswap_32(self->header.index_length);
    self->header.number_of_reads = bswap_32(self->header.number_of_reads);
    self->header.header_length = bswap_16(self->header.header_length);
    self->header.key_length = bswap_16(self->header.key_length);
    self->header.num_flows_per_read = bswap_16(self->header.num_flows_per_read);
#endif

    if( self->header.magic_number != (('.'<<24)|('s'<<16)|('f'<<8)|('f'<<0)) ) {
        if( !skiped_idx_pad && prev_head.magic_number != 0 ) {
            /* possible concatination of 2 files with index at EOF and padded to 8 bytes with header values not padded,
               try skipping padding and reread */
            uint32_t rem = prev_head.index_length % 8;
            if( rem != 0 ) {
                /* only an unaligned index can have padding behind it */
                uint32_t pad = 8 - rem;
                self->file_advance += pad;
                DEBUG_MSG(5, ("%s: trying to skip over %u bytes index section padding\n", self->file_name, pad));
                skiped_idx_pad = true;
                goto SkipIndexPad;
            }
        }
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
        SRALoaderFile_LOG(file, klogErr, rc, "magic number: $(m)", PLOG_U32(m), self->header.magic_number);
        return rc;
    }
    if( self->header.version != 1 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcBadVersion);
        SRALoaderFile_LOG(file, klogErr, rc, "format version $(v)", PLOG_U32(v), self->header.version);
        return rc;
    }
    if( self->header.flowgram_format_code != SFFFormatCodeUI16Hundreths ) {
        /* NOTE: add a case here if flowgram coding gets new version to support different */
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
        SRALoaderFile_LOG(file, klogErr, rc, "common header flowgram format code", NULL);
        return rc;
    }
    if( self->header.index_length % 8 != 0 ) {
        DEBUG_MSG(5, ("%s: index_length field value is not 8 byte padded: %u\n", self->file_name, self->header.index_length));
    }
    /* expected header size: fixed part + flow chars + key, rounded up to 8 bytes */
    head_sz = SFFCommonHeader_size + self->header.num_flows_per_read + self->header.key_length;
    head_sz += (head_sz % 8) ? (8 - (head_sz % 8)) : 0;
    if( head_sz != self->header.header_length ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcFormat, rcInvalid);
        SRALoaderFile_LOG(file, klogErr, rc, "header length $(h) <> $(s) ", PLOG_2(PLOG_U16(h),PLOG_U16(s)),
                          self->header.header_length, head_sz);
        return rc;
    }
    /* read flow chars and key */
    self->file_advance = SFFCommonHeader_size;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, head_sz - SFFCommonHeader_size, "common header", false)) != 0) {
        return rc;
    }
    self->file_advance = head_sz - SFFCommonHeader_size;

    if( (rc = pstring_assign(&self->flow_chars, self->file_buf, self->header.num_flows_per_read)) != 0 ||
        (rc = pstring_assign(&self->key_seq, self->file_buf + self->header.num_flows_per_read, self->header.key_length)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "reading flows/key sequence", NULL);
        return rc;
    }
    if( prev_head.magic_number != 0 ) {
        /* next file's common header must match previous file's common header, partially */
        if( prev_head.key_length != self->header.key_length ||
            prev_head.num_flows_per_read != self->header.num_flows_per_read ||
            pstring_cmp(&prev_flow_chars, &self->flow_chars) != 0 ||
            pstring_cmp(&prev_key_seq, &self->key_seq) != 0 ) {
                rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInconsistent);
                SRALoaderFile_LOG(file, klogErr, rc, "previous file common header differ in flows/key sequence", NULL);
        }
    }
    if( rc == 0 ) {
        if( self->w454 ) {
            rc = SRAWriter454_WriteHead(self->w454, &self->flow_chars, &self->key_seq);
        } else {
            rc = SRAWriterIonTorrent_WriteHead(self->wIonTorrent, &self->flow_chars, &self->key_seq);
        }
    }
    return rc;
}
Ejemplo n.º 18
0
/* Reads the data section of one SFF read into self.
 *
 * The block consists of num_flows_per_read 16-bit signal values followed by
 * number_of_bases bytes each of position deltas, bases and qualities, padded
 * to a multiple of 8 bytes.
 * Unless self->skip_signal is set, the signal is copied (byte-swapped on
 * little-endian hosts) and the position deltas are accumulated into running
 * positions in self->position. Bases go to self->read and qualities to
 * self->quality.
 *
 * Returns 0 on success, otherwise the failing rc; copy failures are logged.
 */
static
rc_t SFFLoaderFmtReadData(SFFLoaderFmt* self, const SRALoaderFile* file)
{
    rc_t rc = 0;
    uint32_t i;

    /* calc signal chunk size */
    size_t signal_sz = self->header.num_flows_per_read * sizeof(uint16_t);
    /* plus position, read, quality */
    size_t sz = signal_sz + self->read_header.number_of_bases * 3;
    /* + padding */
    sz += (sz % 8) ? (8 - (sz % 8)) : 0;

    /* adjust the buffer window to full data block size */
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, sz, "read data", false)) != 0 ) { 
        return rc;
    }
    self->file_advance = sz;

    if( !self->skip_signal ) {
        rc = pstring_assign(&self->signal, self->file_buf, signal_sz);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        if( rc == 0 ) {
            uint16_t* sig = (uint16_t*)self->signal.data;
            for(i = 0; i < self->header.num_flows_per_read; i++) {
                sig[i] = bswap_16(sig[i]);
            }
        }
#endif
    }

    if( rc == 0 ) {
        const uint8_t* pos = self->file_buf + signal_sz;

        if( !self->skip_signal ) {
            INSDC_coord_one *p;
            /* reset buffer to proper size */
            pstring_clear(&self->position);
            rc = pstring_append_chr(&self->position, 0, self->read_header.number_of_bases * sizeof(*p));
            /* fill in only when the buffer was really resized and there is
             * at least one base; otherwise p[0] would be out of bounds */
            if( rc == 0 && self->read_header.number_of_bases > 0 ) {
                p = (INSDC_coord_one*)&self->position.data[0];
                /* stored values are deltas: accumulate them into positions */
                p[0] = pos[0];
                for(i = 1; i < self->read_header.number_of_bases; i++) {
                    p[i] = p[i - 1] + pos[i];
                }
            }
        }
        if( rc == 0 ) {
            pos += self->read_header.number_of_bases;
            rc = pstring_assign(&self->read, pos, self->read_header.number_of_bases);
        }
        if( rc == 0 ) {
            pos += self->read_header.number_of_bases;
            rc = pstring_assign(&self->quality, pos, self->read_header.number_of_bases);
        }
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "copying read data", NULL);
    }
    return rc;
}
Ejemplo n.º 19
0
/*
 * Parses one SRF read chunk: collects the ZTR chunks it carries
 * (sequence, quality, signal/intensity/noise), decompresses them into
 * fe->read and passes the result to fe_new_read().
 *
 * ctx     - SRF context; also accessed as fe_context_t through a cast
 * ztr_ctx - ZTR parser state the chunk payload is appended to
 * data    - start of the read chunk
 * size    - number of bytes available at data
 *
 * Returns 0 on success. On failure returns either the rc of the failing
 * step or the value of SRALoaderFile_LOG for it (presumably the same rc
 * that was passed in - confirm against the macro definition).
 */
static
rc_t parse_read(SRF_context *ctx, ZTR_Context *ztr_ctx, const uint8_t *data, size_t size)
{
    rc_t rc = 0;
    size_t parsed;
    uint8_t flags;
    pstring readId;
    ztr_raw_t ztr_raw;
    ztr_t ztr;
    enum ztr_chunk_type type;
    fe_context_t* fe = (fe_context_t*)ctx;

    /* start with no chunk collected for this read */
    *(void **)&fe->sequence =
    *(void **)&fe->quality1 =
    *(void **)&fe->quality4 =
    *(void **)&fe->signal =
    *(void **)&fe->intensity =
    *(void **)&fe->noise = NULL;

    rc = SRF_ParseReadChunk(data, size, &parsed, &flags, &readId);
    if(rc) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rc);
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "corrupt", NULL);
    }
    /* deferred bytes, when present, go in front of this chunk's payload */
    if(fe->defered != NULL)
        ZTR_AddToBuffer(ztr_ctx, fe->defered, fe->defered_len);
    ZTR_AddToBuffer(ztr_ctx, data + parsed, size - parsed);
    if(fe->defered == NULL) {
        /* try the payload as a block first; if that fails, try a ZTR header */
        rc = ZTR_ParseBlock(ztr_ctx, &ztr_raw);
        if(rc == 0)
            goto PARSE_BLOCK;
        rc = ZTR_ParseHeader(ztr_ctx);
        if(rc) {
            return SRALoaderFile_LOG(ctx->file, klogErr, rc, "corrupt", NULL);
        }
    }

    while (!ZTR_BufferIsEmpty(ztr_ctx)) {
        rc = ZTR_ParseBlock(ztr_ctx, &ztr_raw);
    PARSE_BLOCK:
        if(rc != 0 || (rc = ZTR_ProcessBlock(ztr_ctx, &ztr_raw, &ztr, &type)) != 0 ) {
            return SRALoaderFile_LOG(ctx->file, klogErr, rc, "corrupt", NULL);
        }

        switch (type) {
            case READ:
                if(ztr.sequence->datatype != i8) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    return SRALoaderFile_LOG(ctx->file, klogErr, rc, "invalid data type for sequence data", NULL);
                }
                fe->sequence = ztr;
                break;
            case QUALITY1:
                if(ztr.quality1->datatype != i8) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    return SRALoaderFile_LOG(ctx->file, klogErr, rc, "invalid data type for quality1 data", NULL);
                }
                fe->quality1 = ztr;
                break;
            case QUALITY4:
                if(ztr.quality4->datatype != i8) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    return SRALoaderFile_LOG(ctx->file, klogErr, rc, "invalid data type for quality4 data", NULL);
                }
                fe->quality4 = ztr;
                break;
            case SIGNAL4:
                /* "SLXI" chunks are kept as intensity, "SLXN" as noise, anything
                   else as signal; a chunk the caller asked to skip is freed */
                if(ztr.signal4->Type != NULL && strncmp(ztr.signal4->Type, "SLXI", 4) == 0 ) {
                    if( !fe->skip_intensity ) {
                        fe->intensity = ztr;
                    } else if(ztr.signal4){
                        if(ztr.signal4->data) free(ztr.signal4->data);
                        free(ztr.signal4);
                    }
                } else if(ztr.signal4->Type != NULL && strncmp(ztr.signal4->Type, "SLXN", 4) == 0 ) {
                    if( !fe->skip_noise ) {
                        fe->noise = ztr;
                    } else if(ztr.signal4){
                        if(ztr.signal4->data) free(ztr.signal4->data);
                        free(ztr.signal4);
                    }
                } else if( !fe->skip_signal ) {
                    fe->signal = ztr;
                } else if(ztr.signal4){
                    if(ztr.signal4->data) free(ztr.signal4->data);
                    free(ztr.signal4);
                }
                break;

            default:
                free(*(void **)&ztr);
                /* fallthrough: the raw data is released as well */

            case none:
            case ignore:
                if(ztr_raw.data) {
                    free(ztr_raw.data);
                }
                break;
        }
        if(ztr_raw.meta){
            free(ztr_raw.meta);
            ztr_raw.meta=NULL;
        }
    }

    /* single-pass block: 'break' is used to jump to the cleanup below */
    while(rc == 0) {
        if(*(void **)&fe->sequence == NULL) {
            rc = RC(rcSRA, rcFormatter, rcParsing, rcConstraint, rcViolated);
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "missing sequence data", NULL);
            break;
        }
        if(*(void **)&fe->quality4 == NULL && *(void **)&fe->quality1 == NULL) {
            rc = RC(rcSRA, rcFormatter, rcParsing, rcConstraint, rcViolated);
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "missing quality data", NULL);
            break;
        }

        if( (rc = ILL_ZTR_Decompress(ztr_ctx, BASE, fe->sequence, fe->sequence)) != 0 ||
            (rc = pstring_assign(&fe->read.seq, fe->sequence.sequence->data, fe->sequence.sequence->datasize)) != 0 ) {
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress sequence data", NULL);
            break;
        }

        /* quality4 takes precedence over quality1 when both are present */
        if( *(void **)&fe->quality4 != NULL ) {
            if( (rc = ILL_ZTR_Decompress(ztr_ctx, CNF4, fe->quality4, fe->sequence)) != 0 ||
                (rc = pstring_assign(&fe->read.qual, fe->quality4.quality4->data, fe->quality4.quality4->datasize)) != 0 ) {
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress quality4 data", NULL);
                break;
            }
            fe->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_LOGODDS4;
        } else if( *(void **)&fe->quality1 != NULL ) {
            /* size is taken from the quality1 member, matching the data pointer */
            if( (rc = ILL_ZTR_Decompress(ztr_ctx, CNF1, fe->quality1, fe->sequence)) != 0 ||
                (rc = pstring_assign(&fe->read.qual, fe->quality1.quality1->data, fe->quality1.quality1->datasize)) != 0 ) {
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress quality1 data", NULL);
                break;
            }
            fe->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
        }
        if( *(void **)&fe->signal != NULL ) {
            if( (rc = ILL_ZTR_Decompress(ztr_ctx, SMP4, fe->signal, fe->sequence)) != 0 ||
                (rc = pstring_assign(&fe->read.signal, fe->signal.signal4->data, fe->signal.signal4->datasize)) != 0 ) {
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress signal data", NULL);
                break;
            }
        }
        if( *(void **)&fe->intensity != NULL ) {
            if( (rc = ILL_ZTR_Decompress(ztr_ctx, SMP4, fe->intensity, fe->sequence)) != 0 ||
                (rc = pstring_assign(&fe->read.intensity, fe->intensity.signal4->data, fe->intensity.signal4->datasize)) != 0 ) {
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress intensity data", NULL);
                break;
            }
        }
        if( *(void **)&fe->noise != NULL ) {
            if( (rc = ILL_ZTR_Decompress(ztr_ctx, SMP4, fe->noise, fe->sequence)) != 0 ||
                (rc = pstring_assign(&fe->read.noise, fe->noise.signal4->data, fe->noise.signal4->datasize)) != 0 ) {
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress noise data", NULL);
                break;
            }
        }
        rc = fe_new_read(fe, flags, &readId);
        break;
    }

    /* release every chunk collected for this read */
    if(fe->sequence.sequence) {
        if(fe->sequence.sequence->data)
            free(fe->sequence.sequence->data);
        free(fe->sequence.sequence);
    }
    if(fe->quality1.quality1) {
        if(fe->quality1.quality1->data)
            free(fe->quality1.quality1->data);
        free(fe->quality1.quality1);
    }
    if(fe->quality4.quality4) {
        if(fe->quality4.quality4->data)
            free(fe->quality4.quality4->data);
        free(fe->quality4.quality4);
    }
    if(fe->signal.signal4) {
        if(fe->signal.signal4->data)
            free(fe->signal.signal4->data);
        free(fe->signal.signal4);
    }
    if(fe->intensity.signal4) {
        if(fe->intensity.signal4->data)
            free(fe->intensity.signal4->data);
        free(fe->intensity.signal4);
    }
    if(fe->noise.signal4) {
        if(fe->noise.signal4->data)
            free(fe->noise.signal4->data);
        free(fe->noise.signal4);
    }
    return rc;
}
Ejemplo n.º 20
0
/*
 * Reads data for a single spot from a file; the data may be partial.
 *
 * A record starting with '@' is read as fastq (name, sequence, "+" line,
 * quality); a record starting with '>' is read as fasta (name, sequence),
 * in which case every quality value is set to the constant 14.
 *
 * self - loader object (not used by this function)
 * file - per-file state; on success with data available file->ready is
 *        set and file->name, file->sequence and file->quality are filled
 *
 * Returns 0 when a spot was read, when the previous spot is still unused,
 * or at end of file (file->ready stays false in that case). Otherwise
 * returns the value of SRALoaderFile_LOG for the failing step.
 */
static
rc_t read_next_spot(HelicosLoaderFmt* self, HelicosFileInfo* file)
{
    rc_t rc = 0;
    char marker;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    HelicosFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }

    /* first character of the defline selects the record format */
    marker = file->line[0];
    if( marker != '@' && marker != '>' ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '@' or '>'");
    }

    /* name: the defline without its leading marker (same for both formats) */
    if( (rc = pstring_assign(&file->name, &file->line[1], file->line_len - 1)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading name");
    }
    file->line = NULL;

    /* sequence line (same for both formats) */
    if( (rc = file_read_line(file, false)) != 0 || file->line_len > sizeof(file->sequence.data)-1 ||
        (rc = pstring_assign(&file->sequence, file->line, file->line_len)) != 0 ||
        !pstring_is_fasta(&file->sequence) ) {
        rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence");
    }
    file->line = NULL;

    if( marker == '@' ) { /*** fastq format **/
        /* quality defline must be exactly "+"; a read error is reported as is */
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line_len != 1 || file->line[0] != '+' ) {
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
        }
        file->line = NULL;
        /* NOTE(review): the 33, 0, 0x7F arguments look like ASCII offset and
           value range - confirm against pstring_quality_convert */
        if( (rc = file_read_line(file, false)) != 0 || file->line_len > sizeof(file->quality.data)-1 ||
            (rc = pstring_assign(&file->quality, file->line, file->line_len)) != 0 ||
            (rc = pstring_quality_convert(&file->quality, eExperimentQualityEncoding_Ascii, 33, 0, 0x7F)) != 0 ) {
            /* an over-long quality line leaves rc at 0; make it an error */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
        }
        file->line = NULL;
    } else { /** fasta format **/
        /* no quality in the file: one constant value per base */
        file->quality.len = file->sequence.len;
        memset(file->quality.data, 14, file->quality.len);
    }
    file->ready = true;

#if _DEBUGGING
    DEBUG_MSG(3, ("READ: name:'%s', seq[%u]:'%s', qual[%u]\n", file->name.data,
                file->sequence.len, file->sequence.data, file->quality.len));
#endif
    return 0;
}
Ejemplo n.º 21
0
/*
 * Groups the input files, validates the groups and parses them.
 *
 * Files are visited once per entry of file_types, so that groups are
 * built one file type at a time in file_types order. A file's type is
 * detected from its name and may be overridden by the declared run file
 * type. Files are attached to a group looked up through FGroup_Find; a
 * new group is inserted into the list ordered by the coordinates of its
 * first file. After grouping, every group is checked with FGroup_Validate
 * and then processed with FGroup_Parse.
 *
 * self            - loader object
 * argc            - number of entries in argv
 * argv            - input files
 * spots_bad_count - receives self->spots_bad_count on return
 *
 * Returns 0 on success or the rc of the first failure.
 */
static
rc_t IlluminaLoaderFmt_WriteData(IlluminaLoaderFmt* self, uint32_t argc, const SRALoaderFile* const argv[], int64_t* spots_bad_count)
{
    rc_t rc = 0;
    uint32_t t, i, k, ftype_q = sizeof(file_types) / sizeof(file_types[0]);
    SLList files;
    IlluminaFileInfo* file = NULL;

    SLListInit(&files);

    /* group files using spotname, for _prb. file name prefix is used,
       files reviewed by type detected from name and ordered by file_type array */
    for(t = 0; rc == 0 && t < ftype_q; t++) {
        for(i = 0; rc == 0 && i < argc; i++) {
            const char* fname, *blk_pfx;
            int prefix_len = 0;
            ERunFileType ftype;
            EIlluminaNativeFileType type = eIlluminaNativeFileTypeNotSet;
            FGroup_Find_data data;

            if( (rc = SRALoaderFileName(argv[i], &fname)) != 0 ) {
                SRALoaderFile_LOG(argv[i], klogErr, rc, "reading file name", NULL);
                break;
            }
            if( (rc = SRALoaderFile_FileType(argv[i], &ftype)) != 0 ) {
                SRALoaderFile_LOG(argv[i], klogErr, rc, "reading file type", NULL);
                break;
            }
            if( (rc = SRALoaderFileBlockName(argv[i], &blk_pfx)) != 0 ) {
                SRALoaderFile_LOG(argv[i], klogErr, rc, "reading DATA_BLOCK/@name", NULL);
                break;
            }
            if( blk_pfx == NULL ) {
                blk_pfx = "";
            }
            {
                /* skip path if present */
                const char* p = strrchr(fname, '/');
                fname = p ? p + 1 : fname;
                p = NULL;
                /* first file_types key found in the name decides the type;
                   the text in front of the key becomes the file prefix */
                for(k = 0; type == eIlluminaNativeFileTypeNotSet && k < ftype_q; k++) {
                    const char* const* e = file_types[k].key;
                    while( *e != NULL ) {
                        p = strstr(fname, *e++);
                        if( p != NULL ) {
                            type = file_types[k].type;
                            break;
                        }
                    }
                }
                if( p != NULL ) {
                    prefix_len = p - fname;
                }
            }
            /* a declared run file type overrides the type detected by name */
            if( ftype == rft_IlluminaNativeSeq ) {
                type = eIlluminaNativeFileTypeFasta;
            } else if( ftype == rft_IlluminaNativePrb ) {
                type = eIlluminaNativeFileTypeQuality4;
            } else if( ftype == rft_IlluminaNativeInt ) {
                type = eIlluminaNativeFileTypeIntensity;
            } else if( ftype == rft_IlluminaNativeQseq ) {
                type = eIlluminaNativeFileTypeQSeq;
            }
            if( type == eIlluminaNativeFileTypeNotSet ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
                SRALoaderFile_LOG(argv[i], klogErr, rc, "detecting file type by file name", NULL);
                break;
            }
            if( type != file_types[t].type ) {
                /* one type at a time */
                continue;
            }
            DEBUG_MSG(3, ("file '%s' type set to %d\n", fname, type));
            file = calloc(1, sizeof(*file));
            if( file == NULL ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcMemory, rcExhausted);
                SRALoaderFile_LOG(argv[i], klogErr, rc, "allocating file object", NULL);
                break;
            }
            IlluminaFileInfo_init(file);
            file->file = argv[i];
            file->type = type;

            if( file->type == eIlluminaNativeFileTypeQuality4 ) {
                /* in _prb there is no spotname inside so use file prefix */
                rc = pstring_assign(&data.key, fname, prefix_len);
            } else {
                /* try to get 1st spot so group can be organized by spot name */
                if( (rc = read_next_spot(blk_pfx, file)) != 0 || !file->ready ) {
                    rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcNotFound);
                    SRALoaderFile_LOG(argv[i], klogErr, rc, "reading 1st spot", NULL);
                    break;
                }
                rc = pstring_copy(&data.key, &file->name);
            }
            if( rc != 0 ) {
                /* do not search or create groups with a key that failed to
                   be set; 'file' is released at the end of the function */
                SRALoaderFile_LOG(argv[i], klogErr, rc, "setting file group search key", NULL);
                break;
            }

            data.found = NULL;
            if( SLListDoUntil(&files, FGroup_Find, &data) && data.found != NULL ) {
                IlluminaFileInfo* ss = data.found->files;

                /* walk the group's files: reject a second file of the same
                   type (except qseq), otherwise append at the tail */
                while( rc == 0 && file != NULL ) {
                    if( ss->type != eIlluminaNativeFileTypeQSeq && ss->type == file->type ) {
                        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcDuplicate);
                        SRALoaderFile_LOG(argv[i], klogErr, rc, "type of file for lane", NULL);
                    } else if( ss->next != NULL ) {
                        ss = ss->next;
                    } else {
                        ss->next = file;
                        file->prev = ss;
                        data.found->mask |= file->type;
                        file = NULL;
                    }
                }
            } else {
                data.found = calloc(1, sizeof(*data.found));
                if( data.found == NULL ) {
                    rc = RC(rcSRA, rcFormatter, rcReading, rcMemory, rcInsufficient);
                    SRALoaderFile_LOG(argv[i], klogErr, rc, "preparing file group", NULL);
                    break;
                } else {
                    if( (rc = pstring_assign(&data.found->key, fname, prefix_len)) != 0 ) {
                        SRALoaderFile_LOG(argv[i], klogErr, rc, "setting file group key", NULL);
                        FGroup_Whack(&data.found->dad, NULL);
                        break;
                    } else {
                        FGroup* curr = (FGroup*)SLListHead(&files), *prev = NULL;
                        data.found->blk_pfx = blk_pfx;
                        data.found->files = file;
                        data.found->mask = file->type;
                        /* group inserted into list by coords in 1st spot */
                        while( curr != NULL ) {
                            if( curr->files[0].coord[0] > file->coord[0] ||
                                (curr->files[0].coord[0] == file->coord[0] &&
                                 curr->files[0].coord[1] > file->coord[1]) ) {
                                data.found->dad.next = &curr->dad;
                                if( prev == NULL ) {
                                    files.head = &data.found->dad;
                                } else {
                                    prev->dad.next = &data.found->dad;
                                }
                                break;
                            }
                            prev = curr;
                            curr = (FGroup*)curr->dad.next;
                        }
                        if( curr == NULL ) {
                            /* no group sorts after this one */
                            SLListPushTail(&files, &data.found->dad);
                        }
                        /* the group owns the file object now */
                        file = NULL;
                    }
                }
            }
        }
    }
    if( rc == 0 ) {
        SLListForEach(&files, FGroup_Validate, &rc);
    }
    if( rc == 0 ) {
        FGroup_Parse_data data;
        data.self = self;
        if( SLListDoUntil(&files, FGroup_Parse, &data) ) {
            rc = data.rc;
        }
    } else {
        /* a file object not yet attached to a group is released here */
        free(file);
    }
    SLListWhack(&files, FGroup_Whack, NULL);
    *spots_bad_count = self->spots_bad_count;
    return rc;
}