Exemple #1
0
/*
 * Reads the remainder of one FASTQ record after its '@' defline: the sequence,
 * the '+' quality defline and the quality string, storing them in sd->read.
 *
 * file       - input being parsed; file->line is reset to NULL whenever the
 *              current line has been consumed
 * sd         - record being filled; sd->name, sd->barcode and sd->read.read_id
 *              must already hold what was parsed from the sequence defline
 * best_word  - forwarded to parse_spot_name() when parsing the '+' defline
 * best_score - score the sequence defline obtained; barcode and read id are
 *              compared only when the '+' defline reaches the same score
 * qualType   - stored into sd->read.qual_type on success
 *
 * Returns 0 and sets sd->ready on success; otherwise logs through
 * SRALoaderFile_LOG and returns its result.
 */
static
rc_t read_spot_data_3lines(FastqFileInfo* file, FileReadData* sd, uint8_t best_word, uint8_t best_score, int qualType)
{
    rc_t rc = 0;

    /* the sequence defline has been dealt with by the caller */
    file->line = NULL;

    /* sequence */
    rc = read_multiline_seq_or_qual(file, '+', &sd->read.seq);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence data");
    }
    if( !pstring_is_fasta(&sd->read.seq) ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected sequence data");
    }

    /* quality defline */
    rc = file_read_line(file, false);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
    }
    if( file->line[0] != '+' ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '+' on quality defline");
    }
    if( file->line_len != 1 ) {
        /* more than a bare '+': the text after it has to name the same spot */
        FileReadData qual_def;
        int mismatch;
        uint8_t score = parse_spot_name(file->file, &qual_def, &file->line[1], file->line_len - 1, best_word);

        /* the quality defline may omit barcode and read id and so score lower
           than the sequence defline, but a non-empty line must at least yield
           a name (score >= 1) */
        if( score < 1 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not found");
        }
        mismatch = pstring_cmp(&sd->name, &qual_def.name) != 0;
        if( !mismatch && score == best_score ) {
            /* same amount of detail as the sequence defline: compare it all */
            mismatch = pstring_cmp(&sd->barcode, &qual_def.barcode) != 0 ||
                       sd->read.read_id != qual_def.read.read_id;
        }
        if( mismatch ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=quality defline do not match sequence defline");
        }
    }

    /* quality defline consumed */
    file->line = NULL;

    /* quality */
    rc = read_multiline_seq_or_qual(file, '@', &sd->read.qual);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=failed to read quality");
    }
    if( sd->read.qual.len <= 0 ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcEmpty);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=quality");
    }

    sd->read.qual_type = qualType;
    sd->ready = true;
    return 0;
}
Exemple #2
0
/*
 * Adds one read to the spot being assembled.
 *
 * - An empty spot (nreads == 0) takes the read as its first one.
 * - If the name differs from the spot's name nothing is changed and an
 *   rcIgnored return code is produced.
 * - Otherwise the read is merged into the existing read carrying the same
 *   read_id, or stored as a new read when no such read exists. A merge that
 *   reports rcDuplicate while read->read_id is ILLUMINAWRITER_READID_NONE is
 *   not treated as an error: the read is stored as a new, separate read.
 *
 * Returns 0 on success, otherwise the code produced by IlluminaSpot_Set /
 * IlluminaSpot_Append (merge failures are logged here).
 */
rc_t IlluminaSpot_Add(IlluminaSpot* spot, const pstring* name, const pstring* barcode, const IlluminaRead* read)
{
    rc_t rc = 0;
    int32_t slot;
    int need_new_slot = 1;

    if( spot->nreads == 0 ) {
        /* the very first read defines the spot */
        return IlluminaSpot_Set(spot, spot->nreads++, name, barcode, read);
    }
    if( pstring_cmp(spot->name, name) != 0 ) {
        /* the read carries a different spot name */
        return RC(rcSRA, rcFormatter, rcReading, rcData, rcIgnored);
    }

    /* has this read_id been seen in the spot already? */
    slot = 0;
    while( slot < spot->nreads && spot->reads[slot].read_id != read->read_id ) {
        ++slot;
    }
    if( slot < spot->nreads ) {
        const char* field;

        rc = IlluminaSpot_Append(spot, slot, barcode, read, &field);
        if( GetRCState(rc) == rcDuplicate && read->read_id == ILLUMINAWRITER_READID_NONE ) {
            /* read ids may simply be missing on the deflines, in which case
               these are separate reads: store this one in a slot of its own */
            rc = 0;
        } else {
            need_new_slot = 0;
            if( rc != 0 ) {
                PLOGERR(klogErr, (klogErr, rc, "$(field) for spot '$(s)'", PLOG_2(PLOG_S(field),PLOG_S(s)), field, spot->name->data));
            }
        }
    }
    if( rc == 0 && need_new_slot ) {
        /* no matching read in the spot: append a new one */
        rc = IlluminaSpot_Set(spot, spot->nreads++, name, barcode, read);
    }
    return rc;
}
Exemple #3
0
/*
 * Predicate over one FGroup list node (signature suggests it is used as a
 * singly-linked-list "find" callback; confirm against the caller).
 *
 * node - the FGroup being examined
 * data - FGroup_Find_data holding the search key and receiving the result
 *
 * The group matches when the key equals the name of any file attached to the
 * group or, failing that, the group's own key. On a match data->found is set
 * to the group and true is returned; otherwise data->found is left untouched
 * and false is returned.
 */
bool FGroup_Find( SLNode *node, void *data )
{
    FGroup* group = (FGroup*)node;
    FGroup_Find_data* query = (FGroup_Find_data*)data;
    IlluminaFileInfo* member;
    bool matched = false;

    /* first try every file that belongs to the group */
    for( member = group->files; member != NULL; member = member->next ) {
        if( pstring_cmp(&member->name, &query->key) == 0 ) {
            matched = true;
            break;
        }
    }
    /* then the group's own key */
    if( !matched ) {
        matched = pstring_cmp(&query->key, &group->key) == 0;
    }
    if( matched ) {
        query->found = group;
    }
    return matched;
}
Exemple #4
0
/*
 * Reads from a file the data for a single spot; the data may be partial.
 *
 * Nothing is read while file->spot->ready is still set (previous data not yet
 * used). Otherwise both entries file->spot[0] and file->spot[1] are reset and
 * one record is parsed from the current position:
 *   - a single-line form recognised by find_seq_qual_by_sep() with ':' or ' ';
 *   - a '@' record (4 lines), optionally followed by a second '@' record with
 *     the same name and barcode but a different read id (8 lines), which is
 *     stored in file->spot[1];
 *   - a '>' record carrying either sequence or quality.
 * Quality strings of every ready entry are then converted with
 * pstring_quality_convert(); when file->qualOffset is 0 the offset is detected
 * (33 first, 64 if that is out of range) and remembered in the file.
 *
 * Returns 0 on success and at end of file (file->line == NULL after the read,
 * no entry becomes ready). Every failure is logged through SRALoaderFile_LOG
 * and its result is returned.
 */
static
rc_t read_next_spot(FastqLoaderFmt* self, FastqFileInfo* file)
{
    rc_t rc = 0;
    int k;

    if( file->spot->ready ) {
        /* data still not used */
        return 0;
    }
    FileReadData_init(file->spot, false);
    FileReadData_init(&file->spot[1], false);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    }
    if( file->line == NULL ) {
        return 0; /* eof */
    }
    if( find_seq_qual_by_sep(self, file, ':') || find_seq_qual_by_sep(self, file, ' ') ) {
        /* single line forms */
        file->line = NULL; /* line consumed */
        file->spot->ready = true;
    } else if( file->line[0] == '>' || file->line[0] == '@' ) {
        /* 4 or 8 line format */
        FileReadData sd;
        uint8_t word = 0, best_word = 0;
        uint8_t score = 0, best_score = 0;

        /* find and parse spot name on defline: try successive word positions
           until the parser reports 0 and keep the best scoring result */
        do {
            score = parse_spot_name(file->file, &sd, &file->line[1], file->line_len - 1, ++word);
            if( score > best_score ) {
                if( (rc = pstring_copy(&file->spot->name, &sd.name)) != 0 ||
                    (rc = pstring_copy(&file->spot->barcode, &sd.barcode)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying read name");
                }
                file->spot->read.read_id = sd.read.read_id;
                best_score = score;
                best_word = word; /* used below for quality defline parsing */
            }
        } while( score != 0 );
        if( best_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcId, rcNotFound);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not detected");
        }
        if( file->line[0] == '@' ) {
            if( (rc = read_spot_data_3lines(file, file->spot, best_word, best_score, file->qualType)) != 0 ) {
                return rc;
            }
            /* now we MAY have 5th line in buffer so we can check if it's 2nd read for 8 lines format */
            if( file->line_len != 0 && file->line != NULL && file->line[0] == '@' ) {
                /* try to find read id on next line */
                FileReadData_init(&file->spot[1], false);
                if( parse_spot_name(file->file, &file->spot[1], &file->line[1], file->line_len - 1, best_word) == best_score ) {
                    if( pstring_cmp(&file->spot->name, &file->spot[1].name) == 0 &&
                        pstring_cmp(&file->spot->barcode, &file->spot[1].barcode) == 0 &&
                        file->spot->read.read_id != file->spot[1].read.read_id ) {
                        /* since it is different read id with same name and barcode, fill up second read */
                        if( (rc = read_spot_data_3lines(file, &file->spot[1], best_word, best_score, file->qualType)) != 0 ) {
                            return rc;
                        }
                    }
                }
            }
        } else {
            /* 2 line seq or quality form */
            file->line = NULL; /* line consumed */
            /* read sequence/quality */
            if( (rc = read_multiline_seq_or_qual(file, '>', &file->spot->read.seq)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading seq/qual data");
            }
            if( file->spot->read.seq.len == 0 ) {
                /* a defline without data: report a real error code instead of
                   logging an "error" whose rc is 0 */
                rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=empty string reading seq/qual data");
            }
            if( !pstring_is_fasta(&file->spot->read.seq) ) {
                /* not bases, so treat the data as quality: move seq -> qual.
                   A failed copy must stop here rather than mark the spot ready */
                if( (rc = pstring_copy(&file->spot->read.qual, &file->spot->read.seq)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying quality data");
                }
                file->spot->read.qual_type = file->qualType;
                pstring_clear(&file->spot->read.seq);
            }
            file->spot->ready = true;
        }
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=file corrupt or format unknown");
    }

    /* every failure above returned early, so rc is 0 here */
    for( k = 0; k < 2; k++ ) {
        FileReadData* rd = &file->spot[k];

        if( !rd->ready || rd->read.qual_type == ILLUMINAWRITER_COLMASK_NOTSET ) {
            continue; /* entry unused or carries no quality */
        }
        if( file->qualOffset == 0 ) {
            /* detect and remember: try offset 33 first, fall back to 64 */
            file->qualOffset = 33;
            file->qualMax = 94;
            rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
            if( GetRCState(rc) == rcOutofrange ) {
                file->qualOffset = 64;
                file->qualMax = 61;
                rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
            }
        } else {
            if( file->qualOffset == 33 ) {
                file->qualMax = 94;
            }
            rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
        }
        if( rc != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting quality");
        }
    }
    return 0;
}
Exemple #5
0
/*
 * Merges the data of `read` into the already existing read number `r` of a
 * spot. The caller must have matched spot->name and spot->reads[r].read_id
 * against the read beforehand.
 *
 * Each kind of data (sequence, quality, noise, intensity, signal) may be
 * attached only once per read: the spot stores a pointer to the member inside
 * `read`, nothing is copied. Barcode, quality type and read filter must agree
 * with what the spot already holds.
 *
 * Returns 0 on success. On failure *field names the offending item and the
 * return code has state rcOutofrange (r too large), rcInconsistent (barcode,
 * quality type, read_filter) or rcDuplicate (data already present). Items
 * checked before the failing one may already have been attached. *field is
 * not written on success.
 */
static
rc_t IlluminaSpot_Append(IlluminaSpot* spot, int r, const pstring* barcode, const IlluminaRead* read, const char** field)
{
    if( r >= ILLUMINAWRITER_MAX_NUM_READS ) {
        *field = "number of reads";
        return RC(rcSRA, rcFormatter, rcCopying, rcData, rcOutofrange);
    }

    /* barcodes agree when both are the same pointer (including both NULL)
       or both are present and compare equal */
    if( spot->barcode != barcode &&
        (spot->barcode == NULL || barcode == NULL || pstring_cmp(spot->barcode, barcode) != 0) ) {
        *field = "barcode";
        return RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
    }

    if( read->seq.len > 0 ) {
        if( spot->reads[r].seq != NULL ) {
            *field = "sequence";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
        }
        spot->reads[r].seq = &read->seq;
    }

    if( read->qual_type != ILLUMINAWRITER_COLMASK_NOTSET ) {
        if( spot->reads[r].qual != NULL ) {
            *field = "quality";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
        }
        /* all reads of a spot have to share one quality type */
        if( spot->qual_type != ILLUMINAWRITER_COLMASK_NOTSET && spot->qual_type != read->qual_type ) {
            *field = "quality type";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
        }
        spot->qual_type = read->qual_type;
        spot->reads[r].qual = &read->qual;
    }

    if( read->noise.len > 0 ) {
        if( spot->reads[r].noise != NULL ) {
            *field = "noise";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
        }
        spot->reads[r].noise = &read->noise;
    }

    if( read->intensity.len > 0 ) {
        if( spot->reads[r].intensity != NULL ) {
            *field = "intensity";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
        }
        spot->reads[r].intensity = &read->intensity;
    }

    if( read->signal.len > 0 ) {
        if( spot->reads[r].signal != NULL ) {
            *field = "signal";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
        }
        spot->reads[r].signal = &read->signal;
    }

    /* the filter is recorded on first sight and must not change afterwards */
    if( spot->reads[r].filter == NULL ) {
        spot->reads[r].filter = &read->filter;
    } else if( *(spot->reads[r].filter) != read->filter ) {
        *field = "read_filter";
        return RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
    }
    return 0;
}
Exemple #6
0
/*
 * Reads and validates the SFF common header at the current file position and
 * hands the flow characters and key sequence to the active writer.
 *
 * self - loader state; on success self->header, self->flow_chars and
 *        self->key_seq describe the header just read, and
 *        self->index_correction / self->file_advance are updated
 * file - input file
 *
 * If self->header already holds a header (magic_number != 0) this is treated
 * as the next file of a concatenated stream: the previous header is kept and
 * the new one must agree with it in key length, flow count, flow characters
 * and key sequence. When the magic number is not found in that situation, one
 * retry is made after skipping the bytes that pad the previous file's index
 * section to a multiple of 8.
 *
 * Returns 0 on success; every failure is logged and returned as an rc_t.
 */
static
rc_t SFFLoaderFmtReadCommonHeader(SFFLoaderFmt* self, const SRALoaderFile *file)
{
    rc_t rc = 0;
    bool skiped_idx_pad = false;
    /* wider than the 16-bit header fields so that flows + key cannot wrap */
    uint32_t head_sz;
    SFFCommonHeader prev_head;
    pstring prev_flow_chars;
    pstring prev_key_seq;

    if( (rc = SRALoaderFile_Offset(file, &self->index_correction)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "Reading initial file position", NULL);
        return rc;
    }
SkipIndexPad:
    self->index_correction += self->file_advance;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, SFFCommonHeader_size, NULL, true)) != 0) {
        SRALoaderFile_LOG(file, klogErr, rc, "common header, needed $(needed) bytes",
                          PLOG_U32(needed), SFFCommonHeader_size);
        return rc;
    }
    if( !skiped_idx_pad ) {
        /* Capture the previous header only on the first pass: after the retry
           self->header holds the rejected, misaligned bytes and must not
           replace what the new header is compared against below. */
        if( self->header.magic_number != 0 ) {
            /* next file in stream, remember prev to sync to each */
            memcpy(&prev_head, &self->header, sizeof(SFFCommonHeader));
            if( (rc = pstring_copy(&prev_flow_chars, &self->flow_chars)) != 0 ||
                (rc = pstring_copy(&prev_key_seq, &self->key_seq)) != 0 ) {
                SRALoaderFile_LOG(file, klogErr, rc, "saving previous flows/key sequence", NULL);
                return rc;
            }
        } else {
            prev_head.magic_number = 0;
            prev_head.index_length = 0;
        }
    }

    memcpy(&self->header, self->file_buf, SFFCommonHeader_size);
#if __BYTE_ORDER == __LITTLE_ENDIAN
    /* multi-byte header fields are byte-swapped on little-endian hosts */
    self->header.magic_number = bswap_32(self->header.magic_number);
    self->header.version = bswap_32(self->header.version);
    self->header.index_offset = bswap_64(self->header.index_offset);
    self->header.index_length = bswap_32(self->header.index_length);
    self->header.number_of_reads = bswap_32(self->header.number_of_reads);
    self->header.header_length = bswap_16(self->header.header_length);
    self->header.key_length = bswap_16(self->header.key_length);
    self->header.num_flows_per_read = bswap_16(self->header.num_flows_per_read);
#endif

    if( self->header.magic_number != (('.'<<24)|('s'<<16)|('f'<<8)|('f'<<0)) ) {
        if( !skiped_idx_pad && prev_head.magic_number != 0 ) {
            /* possible concatination of 2 files with index at EOF and padded to 8 bytes with header values not padded,
               try skipping padding and reread */
            /* bytes missing to the next multiple of 8; 0 when already aligned */
            uint32_t pad = (8 - prev_head.index_length % 8) % 8;
            if( pad != 0 ) {
                self->file_advance += pad;
                DEBUG_MSG(5, ("%s: trying to skip over %u bytes index section padding\n", self->file_name, pad));
                skiped_idx_pad = true;
                goto SkipIndexPad;
            }
        }
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
        SRALoaderFile_LOG(file, klogErr, rc, "magic number: $(m)", PLOG_U32(m), self->header.magic_number);
        return rc;
    }
    if( self->header.version != 1 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcBadVersion);
        SRALoaderFile_LOG(file, klogErr, rc, "format version $(v)", PLOG_U32(v), self->header.version);
        return rc;
    }
    if( self->header.flowgram_format_code != SFFFormatCodeUI16Hundreths ) {
        /* NOTE: add a case here if flowgram coding gets new version to support different */
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
        SRALoaderFile_LOG(file, klogErr, rc, "common header flowgram format code", NULL);
        return rc;
    }
    if( self->header.index_length % 8 != 0 ) {
        DEBUG_MSG(5, ("%s: index_length field value is not 8 byte padded: %u\n", self->file_name, self->header.index_length));
    }
    /* expected header size: fixed part + flow chars + key, rounded up to 8.
       Computed in 32 bits: a 16-bit sum could wrap and match a forged
       header_length, making the reads below run past the fetched block. */
    head_sz = SFFCommonHeader_size;
    head_sz += self->header.num_flows_per_read;
    head_sz += self->header.key_length;
    head_sz += (head_sz % 8) ? (8 - (head_sz % 8)) : 0;
    if( head_sz != self->header.header_length ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcFormat, rcInvalid);
        SRALoaderFile_LOG(file, klogErr, rc, "header length $(h) <> $(s) ", PLOG_2(PLOG_U16(h),PLOG_U32(s)),
                          self->header.header_length, head_sz);
        return rc;
    }
    /* read flow chars and key */
    self->file_advance = SFFCommonHeader_size;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, head_sz - SFFCommonHeader_size, "common header", false)) != 0) {
        return rc;
    }
    self->file_advance = head_sz - SFFCommonHeader_size;

    /* the block starts with the flow characters, the key follows them */
    if( (rc = pstring_assign(&self->flow_chars, self->file_buf, self->header.num_flows_per_read)) != 0 ||
        (rc = pstring_assign(&self->key_seq, self->file_buf + self->header.num_flows_per_read, self->header.key_length)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "reading flows/key sequence", NULL);
        return rc;
    }
    if( prev_head.magic_number != 0 ) {
        /* next file's common header must match previous file's common header, partially */
        if( prev_head.key_length != self->header.key_length ||
            prev_head.num_flows_per_read != self->header.num_flows_per_read ||
            pstring_cmp(&prev_flow_chars, &self->flow_chars) != 0 ||
            pstring_cmp(&prev_key_seq, &self->key_seq) != 0 ) {
                rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInconsistent);
                SRALoaderFile_LOG(file, klogErr, rc, "previous file common header differ in flows/key sequence", NULL);
        }
    }
    if( rc == 0 ) {
        /* exactly one of the two writers is expected to be in use */
        if( self->w454 ) {
            rc = SRAWriter454_WriteHead(self->w454, &self->flow_chars, &self->key_seq);
        } else {
            rc = SRAWriterIonTorrent_WriteHead(self->wIonTorrent, &self->flow_chars, &self->key_seq);
        }
    }
    return rc;
}