/*
 * Reads the remainder of a FASTQ record whose '@' defline has already been
 * parsed by the caller: the sequence, the '+' quality defline and the quality.
 *
 * file       - input state; file->line is reset so the current defline is dropped
 * sd         - destination; sd->name, sd->barcode and sd->read.read_id must already
 *              hold the values parsed from the sequence defline
 * best_word  - word index passed to parse_spot_name() for the quality defline
 * best_score - score obtained for the sequence defline; barcode and read id are
 *              only compared when the quality defline reaches the same score
 * qualType   - value stored in sd->read.qual_type on success
 *
 * On success sd->ready is set to true and 0 is returned; every failure is
 * reported through SRALoaderFile_LOG() and its result is returned.
 */
static rc_t read_spot_data_3lines(FastqFileInfo* file, FileReadData* sd, uint8_t best_word, uint8_t best_score, int qualType)
{
    rc_t rc;

    /* discard defline */
    file->line = NULL;

    /* read sequence */
    rc = read_multiline_seq_or_qual(file, '+', &sd->read.seq);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence data");
    }
    if( !pstring_is_fasta(&sd->read.seq) ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected sequence data");
    }

    /* next defline */
    rc = file_read_line(file, false);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
    }
    if( file->line[0] != '+' ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '+' on quality defline");
    }

    /* there may be just '+' on quality defline; anything longer must repeat the spot name */
    if( file->line_len != 1 ) {
        FileReadData qual_def;
        bool mismatch;
        uint8_t qual_score = parse_spot_name(file->file, &qual_def, &file->line[1], file->line_len - 1, best_word);

        /* sometimes quality defline may NOT contain barcode and readid,
           so the score may be lower than best_score, but a non-empty line
           must score at least 1, which means that a name was found */
        if( qual_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not found");
        }

        /* name must always agree; barcode and read id only when fully scored */
        mismatch = pstring_cmp(&sd->name, &qual_def.name) != 0;
        if( !mismatch && qual_score == best_score ) {
            mismatch = pstring_cmp(&sd->barcode, &qual_def.barcode) != 0
                    || sd->read.read_id != qual_def.read.read_id;
        }
        if( mismatch ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=quality defline do not match sequence defline");
        }
    }

    /* discard defline */
    file->line = NULL;

    rc = read_multiline_seq_or_qual(file, '@', &sd->read.qual);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=failed to read quality");
    }
    if( sd->read.qual.len <= 0 ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcEmpty);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=quality");
    }

    sd->read.qual_type = qualType;
    sd->ready = true;
    return 0;
}
/*
 * Adds a read to a spot that is being assembled.
 *
 * - An empty spot (nreads == 0) takes the read as its first one.
 * - If the spot name differs from `name`, nothing is stored and an rcIgnored
 *   return code is produced.
 * - Otherwise the existing reads are searched for the same read_id; on a match
 *   the data is merged through IlluminaSpot_Append(), and when no match exists
 *   the read is stored in a new slot through IlluminaSpot_Set().
 *
 * Returns 0 on success or the return code of the failing step.
 */
rc_t IlluminaSpot_Add(IlluminaSpot* spot, const pstring* name, const pstring* barcode, const IlluminaRead* read)
{
    rc_t rc = 0;
    bool store_as_new = true;
    int32_t idx;

    if( spot->nreads == 0 ) {
        return IlluminaSpot_Set(spot, spot->nreads++, name, barcode, read);
    }
    if( pstring_cmp(spot->name, name) != 0 ) {
        return RC(rcSRA, rcFormatter, rcReading, rcData, rcIgnored);
    }

    /* look if same read_id was already seen in this spot */
    for( idx = 0; idx < spot->nreads; idx++ ) {
        const char* field;

        if( spot->reads[idx].read_id != read->read_id ) {
            continue;
        }
        rc = IlluminaSpot_Append(spot, idx, barcode, read, &field);
        if( GetRCState(rc) == rcDuplicate && read->read_id == ILLUMINAWRITER_READID_NONE ) {
            /* may be the case when read ids are missing on defline
               and these are separate reads: store it as a new read */
            rc = 0;
        } else {
            store_as_new = false;
            if( rc != 0 ) {
                PLOGERR(klogErr, (klogErr, rc, "$(field) for spot '$(s)'", PLOG_2(PLOG_S(field),PLOG_S(s)), field, spot->name->data));
            }
        }
        /* only the first slot with a matching read_id is considered */
        break;
    }

    if( rc == 0 && store_as_new ) {
        /* read was not found, adding new read to this spot */
        rc = IlluminaSpot_Set(spot, spot->nreads++, name, barcode, read);
    }
    return rc;
}
/*
 * List-node predicate: checks whether a file group matches the search key.
 *
 * node - list node, treated as an FGroup
 * data - FGroup_Find_data holding the key to look for and the result slot
 *
 * The group matches when any file on its `files` chain has a name equal to the
 * key, or when the group's own key equals the search key. On a match the group
 * is stored in data->found and true is returned; otherwise false is returned
 * and data->found is left untouched.
 */
bool FGroup_Find( SLNode *node, void *data )
{
    FGroup* group = (FGroup*)node;
    FGroup_Find_data* query = (FGroup_Find_data*)data;
    IlluminaFileInfo* member;
    bool matched = false;

    /* first try the names of the files attached to the group */
    for( member = group->files; member != NULL; member = member->next ) {
        if( pstring_cmp(&member->name, &query->key) == 0 ) {
            matched = true;
            break;
        }
    }
    /* then fall back to the key of the group itself */
    if( !matched && pstring_cmp(&query->key, &group->key) == 0 ) {
        matched = true;
    }

    if( matched ) {
        query->found = group;
    }
    return matched;
}
/*
 * Reads data for a single spot from a file; the data may be partial.
 *
 * Results are placed in file->spot[0] and, for the 8-line form where a second
 * read with the same name/barcode but a different read id follows, also in
 * file->spot[1]. If file->spot[0] is still marked ready (not consumed yet)
 * nothing is read.
 *
 * Recognized layouts, tried in this order:
 *   - single-line forms detected by find_seq_qual_by_sep() with ':' or ' ';
 *   - '@' defline: sequence, '+' defline and quality via read_spot_data_3lines();
 *   - '>' defline: sequence OR quality lines; data that is not fasta is moved
 *     to the quality field.
 * Afterwards the quality of every ready read is converted with
 * pstring_quality_convert(); when file->qualOffset is 0 an offset of 33 is
 * tried first and 64 is used if the first attempt reports rcOutofrange.
 *
 * Returns 0 on success and also when no further line is available, in which
 * case no spot is marked ready. Every failure is logged and returned as a
 * non-zero rc, including an empty seq/qual record and a failed copy while
 * moving data to the quality field.
 */
static rc_t read_next_spot(FastqLoaderFmt* self, FastqFileInfo* file)
{
    rc_t rc = 0;
    int k;

    if( file->spot->ready ) {
        /* data still not used */
        return 0;
    }
    FileReadData_init(file->spot, false);
    FileReadData_init(&file->spot[1], false);

    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }

    if( find_seq_qual_by_sep(self, file, ':') || find_seq_qual_by_sep(self, file, ' ') ) {
        /* single line forms */
        file->line = NULL; /* line consumed */
        file->spot->ready = true;
    } else if( file->line[0] == '>' || file->line[0] == '@' ) {
        /* 4 or 8 line format */
        FileReadData sd;
        uint8_t word = 0, best_word = 0;
        uint8_t score = 0, best_score = 0;

        /* find and parse spot name on defline: try successive words and keep the best scoring one */
        do {
            score = parse_spot_name(file->file, &sd, &file->line[1], file->line_len - 1, ++word);
            if( score > best_score ) {
                if( (rc = pstring_copy(&file->spot->name, &sd.name)) != 0 ||
                    (rc = pstring_copy(&file->spot->barcode, &sd.barcode)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying read name");
                }
                file->spot->read.read_id = sd.read.read_id;
                best_score = score;
                best_word = word; /* used below for quality defline parsing */
            }
        } while( score != 0 );

        if( best_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcId, rcNotFound);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not detected");
        }

        if( file->line[0] == '@' ) {
            if( (rc = read_spot_data_3lines(file, file->spot, best_word, best_score, file->qualType)) != 0 ) {
                return rc;
            }
            /* now we MAY have 5th line in buffer so we can check if it's 2nd read for 8 lines format */
            if( file->line_len != 0 && file->line != NULL && file->line[0] == '@' ) {
                /* try to find read id on next line */
                FileReadData_init(&file->spot[1], false);
                if( parse_spot_name(file->file, &file->spot[1], &file->line[1], file->line_len - 1, best_word) == best_score ) {
                    if( pstring_cmp(&file->spot->name, &file->spot[1].name) == 0 &&
                        pstring_cmp(&file->spot->barcode, &file->spot[1].barcode) == 0 &&
                        file->spot->read.read_id != file->spot[1].read.read_id ) {
                        /* since it is different read id with same name and barcode, fill up second read */
                        if( (rc = read_spot_data_3lines(file, &file->spot[1], best_word, best_score, file->qualType)) != 0 ) {
                            return rc;
                        }
                    }
                }
            }
        } else {
            /* 2 line seq or quality form */
            file->line = NULL; /* line consumed */
            /* read sequence/quality */
            if( (rc = read_multiline_seq_or_qual(file, '>', &file->spot->read.seq)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading seq/qual data");
            }
            if( file->spot->read.seq.len == 0 ) {
                /* rc is 0 at this point: build a real error code so the failure
                   is not logged and returned as success */
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcEmpty);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=empty string reading seq/qual data");
            } else if( !pstring_is_fasta(&file->spot->read.seq) ) {
                /* not a sequence: move the data to the quality field */
                if( (rc = pstring_copy(&file->spot->read.qual, &file->spot->read.seq)) != 0 ) {
                    /* do not mark the spot ready or report success on a failed copy */
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying quality data");
                }
                file->spot->read.qual_type = file->qualType;
                pstring_clear(&file->spot->read.seq);
            }
            file->spot->ready = true;
        }
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=file corrupt or format unknown");
    }

    /* all failures above return early, so rc is 0 here: convert quality of each ready read */
    for( k = 0; k < 2; k++ ) {
        FileReadData* rd = &file->spot[k];

        if( !rd->ready || rd->read.qual_type == ILLUMINAWRITER_COLMASK_NOTSET ) {
            continue;
        }
        if( file->qualOffset == 0 ) {
            /* detect and remember: try offset 33 first, fall back to 64 when values are out of range */
            file->qualOffset = 33;
            file->qualMax = 94;
            rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
            if( GetRCState(rc) == rcOutofrange ) {
                file->qualOffset = 64;
                file->qualMax = 61;
                rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
            }
        } else {
            if( file->qualOffset == 33 ) {
                file->qualMax = 94;
            }
            rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
        }
        if( rc != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting quality");
        }
    }
    return 0;
}
/*
 * Merges the data of `read` into the already existing read slot `r` of `spot`.
 *
 * The caller must have matched spot->name and spot->reads[r].read_id against
 * the incoming read before calling.
 *
 * Only pointers into `read` are stored (sequence, quality, noise, intensity,
 * signal, filter); nothing is copied. A component that is present in `read`
 * but already set in the slot yields rcDuplicate; a barcode, quality type or
 * read filter that disagrees with the stored one yields rcInconsistent; a slot
 * index at or beyond ILLUMINAWRITER_MAX_NUM_READS yields rcOutofrange.
 *
 * field - on failure receives a static string naming the offending component;
 *         it is not written on success.
 *
 * Returns 0 on success.
 */
static rc_t IlluminaSpot_Append(IlluminaSpot* spot, int r, const pstring* barcode, const IlluminaRead* read, const char** field)
{
    const bool spot_has_barcode = spot->barcode != NULL;
    const bool read_has_barcode = barcode != NULL;

    if( r >= ILLUMINAWRITER_MAX_NUM_READS ) {
        *field = "number of reads";
        return RC(rcSRA, rcFormatter, rcCopying, rcData, rcOutofrange);
    }

    /* barcodes must both be absent, be the same object, or compare equal */
    if( spot_has_barcode != read_has_barcode ||
        (spot->barcode != barcode && pstring_cmp(spot->barcode, barcode) != 0) ) {
        *field = "barcode";
        return RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
    }

    if( read->seq.len > 0 ) {
        if( spot->reads[r].seq != NULL ) {
            *field = "sequence";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
        }
        spot->reads[r].seq = &read->seq;
    }

    if( read->qual_type != ILLUMINAWRITER_COLMASK_NOTSET ) {
        if( spot->reads[r].qual != NULL ) {
            *field = "quality";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
        }
        /* all reads of a spot must share one quality type */
        if( spot->qual_type != ILLUMINAWRITER_COLMASK_NOTSET && spot->qual_type != read->qual_type ) {
            *field = "quality type";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
        }
        spot->qual_type = read->qual_type;
        spot->reads[r].qual = &read->qual;
    }

    if( read->noise.len > 0 ) {
        if( spot->reads[r].noise != NULL ) {
            *field = "noise";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
        }
        spot->reads[r].noise = &read->noise;
    }

    if( read->intensity.len > 0 ) {
        if( spot->reads[r].intensity != NULL ) {
            *field = "intensity";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
        }
        spot->reads[r].intensity = &read->intensity;
    }

    if( read->signal.len > 0 ) {
        if( spot->reads[r].signal != NULL ) {
            *field = "signal";
            return RC(rcSRA, rcFormatter, rcReading, rcData, rcDuplicate);
        }
        spot->reads[r].signal = &read->signal;
    }

    /* first filter seen is remembered, later ones must agree with it */
    if( spot->reads[r].filter == NULL ) {
        spot->reads[r].filter = &read->filter;
    } else if( *(spot->reads[r].filter) != read->filter ) {
        *field = "read_filter";
        return RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
    }

    return 0;
}
/*
 * Reads and validates the SFF common header at the current file position,
 * then stores the flow characters and key sequence and writes them out via
 * the 454 or IonTorrent writer.
 *
 * If self->header already holds a header (magic_number != 0) this is treated
 * as the next file of a concatenated stream: the previous header, flow chars
 * and key sequence are remembered and the new ones must agree in key length,
 * number of flows, flow chars and key sequence.
 *
 * When the magic number does not match in a concatenated stream, one retry is
 * made after skipping the bytes needed to pad the previous file's index
 * section to a multiple of 8.
 *
 * Returns 0 on success; every failure is logged and returned as non-zero rc.
 */
static rc_t SFFLoaderFmtReadCommonHeader(SFFLoaderFmt* self, const SRALoaderFile *file)
{
    rc_t rc = 0;
    bool skiped_idx_pad = false;
    uint16_t head_sz;
    SFFCommonHeader prev_head;
    pstring prev_flow_chars;
    pstring prev_key_seq;

    if( (rc = SRALoaderFile_Offset(file, &self->index_correction)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "Reading initial file position", NULL);
        return rc;
    }

SkipIndexPad:
    self->index_correction += self->file_advance;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, SFFCommonHeader_size, NULL, true)) != 0) {
        SRALoaderFile_LOG(file, klogErr, rc, "common header, needed $(needed) bytes", PLOG_U32(needed), SFFCommonHeader_size);
        return rc;
    }

    /* Snapshot the previous header only on the first pass: on the padding
       retry self->header already holds the bytes that were just rejected and
       must not replace the real previous header. */
    if( !skiped_idx_pad ) {
        if( self->header.magic_number != 0 ) {
            /* next file in stream, remember prev to sync to each */
            memcpy(&prev_head, &self->header, sizeof(SFFCommonHeader));
            if( (rc = pstring_copy(&prev_flow_chars, &self->flow_chars)) != 0 ||
                (rc = pstring_copy(&prev_key_seq, &self->key_seq)) != 0 ) {
                SRALoaderFile_LOG(file, klogErr, rc, "saving previous common header", NULL);
                return rc;
            }
        } else {
            prev_head.magic_number = 0;
            prev_head.index_length = 0;
        }
    }

    memcpy(&self->header, self->file_buf, SFFCommonHeader_size);
    /* header fields are byte-swapped on little-endian hosts */
#if __BYTE_ORDER == __LITTLE_ENDIAN
    self->header.magic_number = bswap_32(self->header.magic_number);
    self->header.version = bswap_32(self->header.version);
    self->header.index_offset = bswap_64(self->header.index_offset);
    self->header.index_length = bswap_32(self->header.index_length);
    self->header.number_of_reads = bswap_32(self->header.number_of_reads);
    self->header.header_length = bswap_16(self->header.header_length);
    self->header.key_length = bswap_16(self->header.key_length);
    self->header.num_flows_per_read = bswap_16(self->header.num_flows_per_read);
#endif

    if( self->header.magic_number != (('.'<<24)|('s'<<16)|('f'<<8)|('f'<<0)) ) {
        if( !skiped_idx_pad && prev_head.magic_number != 0 ) {
            /* possible concatenation of 2 files with index at EOF and padded to 8 bytes
               with header values not padded, try skipping padding and reread */
            /* bytes missing to the next multiple of 8; 0 when already aligned */
            uint32_t pad = (8 - prev_head.index_length % 8) % 8;
            if( pad != 0 ) {
                self->file_advance += pad;
                DEBUG_MSG(5, ("%s: trying to skip over %u bytes index section padding\n", self->file_name, pad));
                skiped_idx_pad = true;
                goto SkipIndexPad;
            }
        }
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
        SRALoaderFile_LOG(file, klogErr, rc, "magic number: $(m)", PLOG_U32(m), self->header.magic_number);
        return rc;
    }
    if( self->header.version != 1 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcBadVersion);
        SRALoaderFile_LOG(file, klogErr, rc, "format version $(v)", PLOG_U32(v), self->header.version);
        return rc;
    }
    if( self->header.flowgram_format_code != SFFFormatCodeUI16Hundreths ) {
        /* NOTE: add a case here if flowgram coding gets new version to support different */
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
        SRALoaderFile_LOG(file, klogErr, rc, "common header flowgram format code", NULL);
        return rc;
    }
    if( self->header.index_length % 8 != 0 ) {
        /* not an error, only reported for debugging */
        DEBUG_MSG(5, ("%s: index_length field value is not 8 byte padded: %u\n", self->file_name, self->header.index_length));
    }

    /* expected header size: fixed part + flow chars + key, rounded up to 8 bytes */
    head_sz = SFFCommonHeader_size + self->header.num_flows_per_read + self->header.key_length;
    head_sz += (head_sz % 8) ? (8 - (head_sz % 8)) : 0;
    if( head_sz != self->header.header_length ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcFormat, rcInvalid);
        SRALoaderFile_LOG(file, klogErr, rc, "header length $(h) <> $(s) ", PLOG_2(PLOG_U16(h),PLOG_U16(s)), self->header.header_length, head_sz);
        return rc;
    }

    /* read flow chars and key */
    self->file_advance = SFFCommonHeader_size;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, head_sz - SFFCommonHeader_size, "common header", false)) != 0) {
        return rc;
    }
    self->file_advance = head_sz - SFFCommonHeader_size;
    if( (rc = pstring_assign(&self->flow_chars, self->file_buf, self->header.num_flows_per_read)) != 0 ||
        (rc = pstring_assign(&self->key_seq, self->file_buf + self->header.num_flows_per_read, self->header.key_length)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "reading flows/key sequence", NULL);
        return rc;
    }

    if( prev_head.magic_number != 0 ) {
        /* next file's common header must match previous file's common header, partially */
        if( prev_head.key_length != self->header.key_length ||
            prev_head.num_flows_per_read != self->header.num_flows_per_read ||
            pstring_cmp(&prev_flow_chars, &self->flow_chars) != 0 ||
            pstring_cmp(&prev_key_seq, &self->key_seq) != 0 ) {
            rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInconsistent);
            SRALoaderFile_LOG(file, klogErr, rc, "previous file common header differ in flows/key sequence", NULL);
        }
    }

    if( rc == 0 ) {
        if( self->w454 ) {
            rc = SRAWriter454_WriteHead(self->w454, &self->flow_chars, &self->key_seq);
        } else {
            rc = SRAWriterIonTorrent_WriteHead(self->wIonTorrent, &self->flow_chars, &self->key_seq);
        }
    }
    return rc;
}