/* Splits an optional read label off the end of the spot name in readId and
 * rebuilds readId as "<name_prefix>_<spot name>".
 *
 * self   - loader context; only self->name_prefix is read here
 * readId - in: raw read id; out: final spot name
 * type   - out: read type detected from the suffix, or eAbisolidReadType_SPOT
 * label  - out: label found after the last '_', cleared when there is none
 *
 * Returns 0 on success or the first non-zero rc from a helper.
 */
static rc_t fe_new_read(fe_context_t *self, pstring *readId, EAbisolidReadType* type, pstring* label)
{
    rc_t rc = 0;
    pstring spot_part;
    const char *underscore;

    assert(self && readId && type && label);
    DEBUG_MSG(3, ("READ_LABEL: '%s'\n", readId->data));

    /* spot name may end with '_(F|R).+' */
    underscore = strrchr(readId->data, '_');
    if( underscore == NULL ) {
        /* no suffix at all: whole id is the spot name */
        pstring_clear(label);
        *type = eAbisolidReadType_SPOT;
    } else {
        rc = set_label_type(underscore + 1, label, type);
        if( rc != 0 ) {
            return rc;
        }
        if( *type > eAbisolidReadType_SPOT ) {
            /* drop the label and its leading '_' from the spot name */
            readId->len -= label->len + 1;
        }
    }

    /* readId is also the output, so work from a copy of the trimmed name */
    rc = pstring_copy(&spot_part, readId);
    if( rc != 0 ) {
        return rc;
    }
    return SRAWriteAbsolid_MakeName(&self->name_prefix, &spot_part, readId);
}
/* Builds the spot name and spot group for one SRF read and writes the spot.
 *
 * self   - loader context (name prefix, current read data, writer)
 * flags  - SRF read flags, passed to SRF_set_read_filter()
 * readId - read id from the SRF chunk; anything after '#' is taken as the
 *          spot group (barcode) and cut from the id by shortening readId->len
 *
 * Returns 0 on success or the rc of the failing helper.
 */
static rc_t fe_new_read(fe_context_t *self, int flags, pstring *readId )
{
    rc_t rc = 0;
    char *suffix;
    pstring readName, spotGroup;
    /* static storage: kept out of the stack frame, which also makes this
       function non-reentrant */
    static IlluminaSpot spot;

    /* look for spot group */
    suffix = strchr(readId->data, '#');
    if( suffix != NULL ) {
        readId->len = suffix++ - readId->data;
        if( (rc = pstring_assign(&spotGroup, suffix, strlen(suffix))) != 0 ) {
            SRALoaderFile_LOG(self->ctx.file, klogInt, rc, "extracting barcode from spot '$(spotname)'", "spotname=%s", readId->data);
            return rc;
        }
    } else {
        pstring_clear(&spotGroup);
    }

    /* build the read name from prefix (self->name_prefix) and read id */
    if( self->name_prefix.len > 0 ) {
        if( (rc = pstring_copy(&readName, &self->name_prefix)) == 0 ) {
            /* <ctype.h> functions require a value representable as unsigned char;
               a plain (possibly negative) char is undefined behavior */
            if( isdigit((unsigned char)readName.data[readName.len - 1]) ) {
                /* keep a trailing number in the prefix apart from the read id */
                rc = pstring_append(&readName, ":", 1);
            }
            if( rc == 0 ) {
                rc = pstring_concat(&readName, readId);
            }
        }
    } else {
        rc = pstring_copy(&readName, readId);
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(self->ctx.file, klogErr, rc, "preparing spot name $(spotname)", "spotname=%s", readId->data);
        return rc;
    }

    SRF_set_read_filter(&self->read.filter, flags);

    IlluminaSpot_Init(&spot);
    if( (rc = IlluminaSpot_Add(&spot, &readName, &spotGroup, &self->read)) == 0 ) {
        rc = SRAWriterIllumina_Write(self->writer, self->ctx.file, &spot);
    }
    return rc;
}
/* Converts textual quality scores in qstr to binary values, in place.
 *
 * qstr   - in: quality text; out: one byte per score (only overwritten when
 *          the whole conversion succeeds)
 * enc    - text encoding; for eExperimentQualityEncoding_Undefined the form is
 *          guessed: Decimal when the text contains a space or tab, else Ascii
 * offset - value subtracted from each character in Ascii form
 * min, max - inclusive range every score must fall into
 *
 * Returns 0 on success; rcParam/rcInvalid for a NULL qstr or min > max;
 * rcData/rcOutofrange when a score is out of range or strtol reports an error;
 * rcData/rcUnrecognized for an unknown encoding.
 */
rc_t pstring_quality_convert(pstring* qstr, ExperimentQualityEncoding enc, const uint8_t offset, const int8_t min, const int8_t max)
{
    rc_t rc = 0;
    char* c, *end, *next;
    pstring qbin;

    if( qstr == NULL || min > max ) {
        /* must return here: the code below dereferences qstr */
        return RC(rcSRA, rcFormatter, rcReading, rcParam, rcInvalid);
    }
    errno = 0;
    c = qstr->data;
    end = qstr->data + qstr->len;
    pstring_clear(&qbin);

    if( enc == eExperimentQualityEncoding_Undefined ) {
        if( memchr(c, ' ', qstr->len) != NULL || memchr(c, '\t', qstr->len) != NULL ) {
            enc = eExperimentQualityEncoding_Decimal;
        } else {
            enc = eExperimentQualityEncoding_Ascii;
        }
    }
    while( rc == 0 && c < end ) {
        long q;

        switch(enc) {
            case eExperimentQualityEncoding_Decimal:
            case eExperimentQualityEncoding_Hexadecimal:
                /* spaced numbers form */
                errno = 0;
                q = strtol(c, &next, enc == eExperimentQualityEncoding_Decimal ? 10 : 16);
                if( q == 0 && c == next ) {
                    /* no more digits in line; goto because break would only leave the switch */
                    goto DONE;
                }
                c = next;
                break;
            case eExperimentQualityEncoding_Ascii:
                /* textual form with offset */
                q = (long)(*c++) - offset;
                break;
            default:
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
                break;
        }
        if( rc == 0 ) {
            if( errno != 0 || q < min || q > max ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcOutofrange);
            } else {
                rc = pstring_append_chr(&qbin, (int8_t)q, 1);
            }
        }
    }
DONE:
    if( rc == 0 ) {
        rc = pstring_copy(qstr, &qbin);
    }
    return rc;
}
/* Builds a spot name as prefix + '_' + suffix.
 *
 * prefix - leading part of the name, must not be NULL
 * suffix - trailing part; may be NULL or empty, then name is just the prefix
 * name   - out: resulting name, must not be NULL
 *
 * The '_' separator is inserted only when the prefix is non-empty and neither
 * the prefix end nor the suffix start already is an underscore.
 * Returns 0 on success; any failure is logged and returned.
 */
rc_t SRAWriteAbsolid_MakeName(const pstring* prefix, const pstring* suffix, pstring* name)
{
    rc_t rc = 0;

    if( prefix == NULL || name == NULL ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcParam, rcNull);
    } else if( (rc = pstring_copy(name, prefix)) == 0 ) {
        if( suffix && suffix->len > 0 ) {
            if( name->len > 0 && name->data[name->len - 1] != '_' && suffix->data[0] != '_' ) {
                rc = pstring_append(name, "_", 1);
            }
            if( rc == 0 ) {
                /* keep the result: a failed concat must not be reported as success */
                rc = pstring_concat(name, suffix);
            }
        }
    }
    if( rc != 0 ) {
        LOGERR(klogErr, rc, "preparing spot name");
    }
    return rc;
}
/* Reads data for a single spot from a file; the data may be partial.
 *
 * self - loader object, passed through to find_seq_qual_by_sep()
 * file - per-file state; file->spot[0] and file->spot[1] receive the first and
 *        (for the 8 line form) second read of the spot
 *
 * Does nothing when file->spot[0] still holds unused data. At end of file
 * returns 0 with file->spot->ready left false. Recognized layouts:
 *   - single line forms with ':' or ' ' between fields
 *   - '@' defline: 4 line form, or 8 line form when the next defline has the
 *     same name and barcode but a different read id
 *   - '>' defline: 2 line sequence or quality form
 * Quality text of ready reads is converted to binary; when file->qualOffset is
 * 0 the offset is detected by trying 33 first and then 64.
 *
 * Returns 0 on success or eof, otherwise a logged error rc.
 */
static rc_t read_next_spot(FastqLoaderFmt* self, FastqFileInfo* file)
{
    rc_t rc = 0;

    if( file->spot->ready ) {
        /* data still not used */
        return 0;
    }
    FileReadData_init(file->spot, false);
    FileReadData_init(&file->spot[1], false);

    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    if( find_seq_qual_by_sep(self, file, ':') || find_seq_qual_by_sep(self, file, ' ') ) {
        /* single line forms */
        file->line = NULL; /* line consumed */
        file->spot->ready = true;
    } else if( file->line[0] == '>' || file->line[0] == '@' ) {
        /* 4 or 8 line format */
        FileReadData sd;
        uint8_t word = 0, best_word = 0;
        uint8_t score = 0, best_score = 0;

        /* find and parse spot name on defline: try each word, keep the best scoring one */
        do {
            score = parse_spot_name(file->file, &sd, &file->line[1], file->line_len - 1, ++word);
            if( score > best_score ) {
                if( (rc = pstring_copy(&file->spot->name, &sd.name)) != 0 ||
                    (rc = pstring_copy(&file->spot->barcode, &sd.barcode)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying read name");
                }
                file->spot->read.read_id = sd.read.read_id;
                best_score = score;
                best_word = word; /* used below for quality defline parsing */
            }
        } while( score != 0 );
        if( best_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcId, rcNotFound);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not detected");
        }
        if( file->line[0] == '@' ) {
            if( (rc = read_spot_data_3lines(file, file->spot, best_word, best_score, file->qualType)) != 0 ) {
                return rc;
            }
            /* now we MAY have 5th line in buffer so we can check if it's 2nd read for 8 lines format */
            if( file->line_len != 0 && file->line != NULL && file->line[0] == '@' ) {
                /* try to find read id on next line */
                FileReadData_init(&file->spot[1], false);
                if( parse_spot_name(file->file, &file->spot[1], &file->line[1], file->line_len - 1, best_word) == best_score ) {
                    if( pstring_cmp(&file->spot->name, &file->spot[1].name) == 0 &&
                        pstring_cmp(&file->spot->barcode, &file->spot[1].barcode) == 0 &&
                        file->spot->read.read_id != file->spot[1].read.read_id ) {
                        /* since it is different read id with same name and barcode, fill up second read */
                        if( (rc = read_spot_data_3lines(file, &file->spot[1], best_word, best_score, file->qualType)) != 0 ) {
                            return rc;
                        }
                    }
                }
            }
        } else {
            /* 2 line seq or quality form */
            file->line = NULL; /* line consumed */
            /* read sequence/quality */
            if( (rc = read_multiline_seq_or_qual(file, '>', &file->spot->read.seq)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading seq/qual data");
            }
            if( file->spot->read.seq.len == 0 ) {
                /* rc is 0 at this point; set a real error code so the failure is not reported as success */
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=empty string reading seq/qual data");
            } else if( !pstring_is_fasta(&file->spot->read.seq) ) {
                /* not bases, so the text is quality: move it from seq to qual */
                if( (rc = pstring_copy(&file->spot->read.qual, &file->spot->read.seq)) != 0 ) {
                    /* do not mark the spot ready with the data half-moved */
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying quality data");
                }
                file->spot->read.qual_type = file->qualType;
                pstring_clear(&file->spot->read.seq);
            }
            file->spot->ready = true;
        }
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=file corrupt or format unknown");
    }

    if( rc == 0 ) {
        int k;
        for(k = 0; k < 2; k++) {
            FileReadData* rd = &file->spot[k];
            if( rd->ready && rd->read.qual_type != ILLUMINAWRITER_COLMASK_NOTSET ) {
                if( file->qualOffset == 0 ) {
                    /* detect and remember: try offset 33 first, fall back to 64 when values are out of range */
                    file->qualOffset = 33;
                    file->qualMax = 94;
                    rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                    if( GetRCState(rc) == rcOutofrange ) {
                        file->qualOffset = 64;
                        file->qualMax = 61;
                        rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                    }
                } else {
                    if( file->qualOffset == 33 ) {
                        file->qualMax = 94;
                    }
                    rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                }
                if( rc != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting quality");
                }
            }
        }
    }
    return rc;
}
/* Groups the input files, validates the groups and parses each one.
 *
 * self            - loader object; passed to FGroup_Parse via FGroup_Parse_data
 * argc, argv      - input files
 * spots_bad_count - out: copy of self->spots_bad_count after processing
 *
 * Files are visited once per entry of file_types[], so files are attached to
 * groups in file_types[] order. The native type comes from the file name
 * (first file_types[].key substring found) unless the declared run file type
 * overrides it. For _prb files the lookup key is the file name prefix before
 * the matched substring; for all other types it is the name of the 1st spot.
 * New groups are kept sorted by the first two coords of their 1st spot.
 *
 * Returns 0 on success or the first error rc (already logged).
 */
static rc_t IlluminaLoaderFmt_WriteData(IlluminaLoaderFmt* self, uint32_t argc, const SRALoaderFile* const argv[], int64_t* spots_bad_count)
{
    rc_t rc = 0;
    uint32_t t, i, k, ftype_q = sizeof(file_types) / sizeof(file_types[0]);
    SLList files;
    IlluminaFileInfo* file = NULL;

    SLListInit(&files);

    /* group files using spotname, for _prb. file name prefix is used,
       files reviewed by type detected from name and ordered by file_type array */
    for(t = 0; rc == 0 && t < ftype_q; t++) {
        for(i = 0; rc == 0 && i < argc; i++) {
            const char* fname, *blk_pfx;
            int prefix_len = 0;
            ERunFileType ftype;
            EIlluminaNativeFileType type = eIlluminaNativeFileTypeNotSet;
            FGroup_Find_data data;

            if( (rc = SRALoaderFileName(argv[i], &fname)) != 0 ) {
                SRALoaderFile_LOG(argv[i], klogErr, rc, "reading file name", NULL);
                break;
            }
            if( (rc = SRALoaderFile_FileType(argv[i], &ftype)) != 0 ) {
                SRALoaderFile_LOG(argv[i], klogErr, rc, "reading file type", NULL);
                break;
            }
            if( (rc = SRALoaderFileBlockName(argv[i], &blk_pfx)) != 0 ) {
                SRALoaderFile_LOG(argv[i], klogErr, rc, "reading DATA_BLOCK/@name", NULL);
                break;
            }
            if( blk_pfx == NULL ) {
                blk_pfx = "";
            }
            {
                /* skip path if present */
                const char* p = strrchr(fname, '/');
                fname = p ? p + 1 : fname;
                p = NULL;
                for(k = 0; type == eIlluminaNativeFileTypeNotSet && k < ftype_q; k++) {
                    const char* const* e = file_types[k].key;
                    while( *e != NULL ) {
                        p = strstr(fname, *e++);
                        if( p != NULL ) {
                            type = file_types[k].type;
                            break;
                        }
                    }
                }
                if( p != NULL ) {
                    /* length of the file name part before the type marker */
                    prefix_len = p - fname;
                }
            }
            /* explicitly declared file type wins over the name based guess */
            if( ftype == rft_IlluminaNativeSeq ) {
                type = eIlluminaNativeFileTypeFasta;
            } else if( ftype == rft_IlluminaNativePrb ) {
                type = eIlluminaNativeFileTypeQuality4;
            } else if( ftype == rft_IlluminaNativeInt ) {
                type = eIlluminaNativeFileTypeIntensity;
            } else if( ftype == rft_IlluminaNativeQseq ) {
                type = eIlluminaNativeFileTypeQSeq;
            }
            if( type == eIlluminaNativeFileTypeNotSet ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
                SRALoaderFile_LOG(argv[i], klogErr, rc, "detecting file type by file name", NULL);
                break;
            }
            if( type != file_types[t].type ) {
                /* one type at a time */
                continue;
            }
            DEBUG_MSG(3, ("file '%s' type set to %d\n", fname, type));

            file = calloc(1, sizeof(*file));
            if( file == NULL ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcMemory, rcExhausted);
                SRALoaderFile_LOG(argv[i], klogErr, rc, "allocating file object", NULL);
                break;
            }
            IlluminaFileInfo_init(file);
            file->file = argv[i];
            file->type = type;
            if( file->type == eIlluminaNativeFileTypeQuality4 ) {
                /* in _prb there is no spotname inside so use file prefix */
                rc = pstring_assign(&data.key, fname, prefix_len);
            } else {
                /* try to get 1st spot so group can be organized by spot name */
                if( (rc = read_next_spot(blk_pfx, file)) != 0 || !file->ready ) {
                    rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcNotFound);
                    SRALoaderFile_LOG(argv[i], klogErr, rc, "reading 1st spot", NULL);
                    break;
                }
                rc = pstring_copy(&data.key, &file->name);
            }
            if( rc != 0 ) {
                /* do not search with an unusable key; 'file' is freed at the end of the function */
                SRALoaderFile_LOG(argv[i], klogErr, rc, "preparing file group key", NULL);
                break;
            }
            data.found = NULL;
            if( SLListDoUntil(&files, FGroup_Find, &data) && data.found != NULL ) {
                /* existing group: append file to the end of the group's file chain */
                IlluminaFileInfo* ss = data.found->files;
                while( rc == 0 && file != NULL ) {
                    if( ss->type != eIlluminaNativeFileTypeQSeq && ss->type == file->type ) {
                        /* only qseq may appear more than once within a group */
                        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcDuplicate);
                        SRALoaderFile_LOG(argv[i], klogErr, rc, "type of file for lane", NULL);
                    } else if( ss->next != NULL ) {
                        ss = ss->next;
                    } else {
                        ss->next = file;
                        file->prev = ss;
                        data.found->mask |= file->type;
                        file = NULL; /* ownership moved to the group */
                    }
                }
            } else {
                data.found = calloc(1, sizeof(*data.found));
                if( data.found == NULL ) {
                    rc = RC(rcSRA, rcFormatter, rcReading, rcMemory, rcInsufficient);
                    SRALoaderFile_LOG(argv[i], klogErr, rc, "preparing file group", NULL);
                    break;
                } else {
                    if( (rc = pstring_assign(&data.found->key, fname, prefix_len)) != 0 ) {
                        SRALoaderFile_LOG(argv[i], klogErr, rc, "setting file group key", NULL);
                        FGroup_Whack(&data.found->dad, NULL);
                        break;
                    } else {
                        FGroup* curr = (FGroup*)SLListHead(&files), *prev = NULL;

                        data.found->blk_pfx = blk_pfx;
                        data.found->files = file;
                        data.found->mask = file->type;
                        /* group inserted into list by coords in 1st spot */
                        while( curr != NULL ) {
                            if( curr->files[0].coord[0] > file->coord[0] ||
                                (curr->files[0].coord[0] == file->coord[0] && curr->files[0].coord[1] > file->coord[1]) ) {
                                /* link new group in front of the first group with larger coords */
                                data.found->dad.next = &curr->dad;
                                if( prev == NULL ) {
                                    files.head = &data.found->dad;
                                } else {
                                    prev->dad.next = &data.found->dad;
                                }
                                break;
                            }
                            prev = curr;
                            curr = (FGroup*)curr->dad.next;
                        }
                        if( curr == NULL ) {
                            SLListPushTail(&files, &data.found->dad);
                        }
                        file = NULL; /* ownership moved to the group */
                    }
                }
            }
        }
    }
    if( rc == 0 ) {
        SLListForEach(&files, FGroup_Validate, &rc);
    }
    if( rc == 0 ) {
        FGroup_Parse_data data;
        data.self = self;
        if( SLListDoUntil(&files, FGroup_Parse, &data) ) {
            rc = data.rc;
        }
    } else {
        /* a file object not yet attached to any group is still owned here */
        free(file);
    }
    SLListWhack(&files, FGroup_Whack, NULL);
    *spots_bad_count = self->spots_bad_count;
    return rc;
}
/* List callback: reads every spot from all files of one group and writes them.
 *
 * n - list node, cast to the FGroup being processed
 * d - FGroup_Parse_data: loader object, scratch spot and result code
 *
 * Each pass of the outer loop reads the next spot from every file in the
 * group, merges the pieces into data->spot with IlluminaSpot_Add() and writes
 * the spot. The loop ends when no file produced data or an error occurred.
 *
 * Returns true when data->rc is non-zero; the caller of SLListDoUntil() then
 * takes the error from data->rc.
 */
bool FGroup_Parse( SLNode *n, void *d )
{
    FGroup_Parse_data* data = (FGroup_Parse_data*)d;
    FGroup* g = (FGroup*)n;
    bool done;
    /* file reported to the writer as the data source: the last file that contributed to the spot */
    const SRALoaderFile* data_block_ref = NULL;

    data->rc = 0;
    do {
        IlluminaFileInfo* file = g->files;
        done = true;
        /* fetch next spot from each file; files that still hold unused data are left as is by read_next_spot */
        while( data->rc == 0 && file != NULL ) {
            if( (data->rc = read_next_spot(g->blk_pfx, file)) == 0 && file->ready ) {
                done = false;
            }
            file = file->next;
        }
        if( data->rc != 0 || done ) {
            /* error, or no file has any more data */
            break;
        }
        /* collect spot reads, matching by spot name
         * spot data may be split across multiple files */
        IlluminaSpot_Init(&data->spot);
        file = g->files;
        while( data->rc == 0 && file != NULL ) {
            if( file->ready ) {
                if( (file->type == eIlluminaNativeFileTypeNoise && data->self->skip_noise) ||
                    (file->type == eIlluminaNativeFileTypeIntensity && data->self->skip_intensity) ||
                    (file->type == eIlluminaNativeFileTypeSignal && data->self->skip_signal) ) {
                    /* this kind of data is not wanted: discard what was read */
                    file->ready = false;
                } else {
                    data_block_ref = file->file;
                    if( file->type == eIlluminaNativeFileTypeQSeq && (g->mask & eIlluminaNativeFileTypeQuality4) ) {
                        /* drop quality1 from qseq data */
                        pstring_clear(&file->read.qual);
                    } else if( file->type == eIlluminaNativeFileTypeQuality4 ) {
                        /* NOTE(review): neib is NULL when the prb file is alone in its group;
                           presumably FGroup_Validate rules that out - confirm */
                        IlluminaFileInfo* neib = file->next ? file->next : file->prev;
                        /* need to fix spotname to be same cause prb do not have any name in it */
                        if( (data->rc = pstring_copy(&file->name, &neib->name)) != 0 ) {
                            SRALoaderFile_LOG(file->file, klogErr, data->rc, "$(msg) '$(n)'", "msg=syncing prb spot name,n=%s", neib->name.data);
                        }
                    }
                    if( data->rc == 0 ) {
                        data->rc = IlluminaSpot_Add(&data->spot, &file->name, &file->barcode, &file->read);
                        if( data->rc == 0 ) {
                            /* data consumed */
                            file->ready = false;
                        } else {
                            if( GetRCState(data->rc) == rcIgnored ) {
                                SRALoaderFile_LOG(file->file, klogWarn, data->rc, "$(msg) '$(s1)' <> '$(s2)'", "msg=spot name mismatch,s1=%.*s,s2=%.*s", data->spot.name->len, data->spot.name->data, file->name.len, file->name.data);
                                data->self->spots_bad_count++;
                                /* skip spot for all files in a group */
                                file = g->files;
                                while( file != NULL ) {
                                    file->ready = false;
                                    SRALoaderFile_LOG(file->file, klogWarn, data->rc, "$(msg) '$(n)'", "msg=skipped spot,n=%s", file->name.data);
                                    file = file->next;
                                }
                                /* a negative spots_bad_allowed disables the limit */
                                if( data->self->spots_bad_allowed >= 0 && data->self->spots_bad_count > data->self->spots_bad_allowed ) {
                                    data->rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
                                }
                                /* file is NULL here, so leave the loop before it is advanced below */
                                break;
                            }
                        }
                    }
                }
            }
            file = file->next;
        }
        if( GetRCState(data->rc) == rcIgnored ) {
            /* bad spot was skipped and the limit is not exceeded: go on with the next spot */
            data->rc = 0;
            continue;
        }
        if( data->rc == 0 ) {
            data->rc = SRAWriterIllumina_Write(data->self->writer, data_block_ref, &data->spot);
        }
    } while( data->rc == 0 );
    return data->rc != 0;
}
/* Reads data for a single spot from a file; the data may be partial.
 *
 * blk_pfx - DATA_BLOCK name; prepended (with ':') to the spot name built from
 *           coords, and used as the whole spot name for _prb files
 * file    - per-file state; receives name, coords and read data
 *
 * Does nothing when file->ready is still set. At end of file returns 0 with
 * file->ready left false. On success file->line is reset to NULL and
 * file->ready is set.
 *
 * Returns 0 on success or eof, otherwise a logged error rc.
 */
static rc_t read_next_spot(const char* blk_pfx, IlluminaFileInfo* file)
{
    rc_t rc = 0;
    const char* tail = NULL;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    IlluminaFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    /* tail must refer to the line just read; types that parse coords move it past them */
    tail = file->line;

    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
            if( (rc = parse_qseq(file, file->line, file->line_len)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading qseq");
            }
            break;

        case eIlluminaNativeFileTypeFasta:
        case eIlluminaNativeFileTypeNoise:
        case eIlluminaNativeFileTypeIntensity:
        case eIlluminaNativeFileTypeSignal:
            {
                /* read only common first 4 coords into name and prepend with DATA_BLOCK/@name */
                if( (rc = read_spot_coord(file, file->line, file->line_len, &tail)) == 0 ) {
                    if( blk_pfx != NULL ) {
                        pstring tmp_name;
                        if( (rc = pstring_copy(&tmp_name, &file->name)) == 0 &&
                            (rc = pstring_assign(&file->name, blk_pfx, strlen(blk_pfx))) == 0 &&
                            (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                            rc = pstring_concat(&file->name, &tmp_name);
                        }
                    }
                }
                if( rc != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading spot coord");
                }
                break;
            }

        case eIlluminaNativeFileTypeQuality4:
            {
                /* same NULL tolerance as the coord based branch above */
                const char* pfx = blk_pfx != NULL ? blk_pfx : "";
                if( (rc = read_quality(file->line, file->line_len, &file->read)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
                } else if( (rc = pstring_assign(&file->name, pfx, strlen(pfx))) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=name for quality 4");
                }
                break;
            }

        default:
            rc = RC(rcSRA, rcFormatter, rcReading, rcFileFormat, rcUnknown);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=processing data line");
    }

    /* process tail (after coords) for some file types */
    file->line_len -= tail - file->line; /* length of tail */
    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
        case eIlluminaNativeFileTypeQuality4:
        default:
            /* completely processed before */
            break;

        case eIlluminaNativeFileTypeFasta:
            if( (rc = pstring_assign(&file->read.seq, tail, file->line_len)) != 0 ||
                !pstring_is_fasta(&file->read.seq) ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading fasta");
            }
            break;

        case eIlluminaNativeFileTypeNoise:
            if( (rc = read_signal(tail, file->line_len, &file->read.noise)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting noise");
            }
            break;

        case eIlluminaNativeFileTypeIntensity:
            if( (rc = read_signal(tail, file->line_len, &file->read.intensity)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting intensity");
            }
            break;

        case eIlluminaNativeFileTypeSignal:
            if( (rc = read_signal(tail, file->line_len, &file->read.signal)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting signal");
            }
            break;
    }
    file->line = NULL; /* line consumed */
    file->ready = true;
#if _DEBUGGING
    DEBUG_MSG(3, ("name:'%s' [%li:%li:%li:%li]\n", file->name.data,
        file->coord[0], file->coord[1], file->coord[2], file->coord[3]));
    if( file->read.seq.len ) {
        DEBUG_MSG(3, ("seq:'%.*s'\n", file->read.seq.len, file->read.seq.data));
    }
    if( file->read.qual.len ) {
        DEBUG_MSG(3, ("qual{0x%x}: %u bytes\n", file->read.qual_type, file->read.qual.len));
    }
#endif
    return 0;
}
/* Reads and validates an SFF common header and passes flow chars and key
 * sequence to the writer.
 *
 * self - loader object; header, flow_chars, key_seq, file_advance and
 *        index_correction are updated
 * file - input file
 *
 * When self->header already holds a header (magic_number != 0) the input is
 * treated as the next file of a concatenated stream: the new header must match
 * the previous one in key length, flow count, flow chars and key sequence.
 * If the magic number is wrong in that situation, one retry is made after
 * skipping the bytes that pad the previous file's index to 8 bytes.
 *
 * Returns 0 on success or a logged error rc.
 */
static rc_t SFFLoaderFmtReadCommonHeader(SFFLoaderFmt* self, const SRALoaderFile *file)
{
    rc_t rc = 0;
    bool skiped_idx_pad = false;
    uint16_t head_sz;
    SFFCommonHeader prev_head;
    pstring prev_flow_chars;
    pstring prev_key_seq;

    if( (rc = SRALoaderFile_Offset(file, &self->index_correction)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "Reading initial file position", NULL);
        return rc;
    }

SkipIndexPad:
    self->index_correction += self->file_advance;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, SFFCommonHeader_size, NULL, true)) != 0) {
        SRALoaderFile_LOG(file, klogErr, rc, "common header, needed $(needed) bytes", PLOG_U32(needed), SFFCommonHeader_size);
        return rc;
    }
    if( !skiped_idx_pad ) {
        /* On the retry pass self->header holds the rejected bytes from the
           first attempt, so the saved previous header must not be replaced */
        if( self->header.magic_number != 0 ) {
            /* next file in stream, remember prev to sync to each */
            memcpy(&prev_head, &self->header, sizeof(SFFCommonHeader));
            if( (rc = pstring_copy(&prev_flow_chars, &self->flow_chars)) != 0 ||
                (rc = pstring_copy(&prev_key_seq, &self->key_seq)) != 0 ) {
                SRALoaderFile_LOG(file, klogErr, rc, "saving previous flows/key sequence", NULL);
                return rc;
            }
        } else {
            prev_head.magic_number = 0;
            prev_head.index_length = 0;
        }
    }
    memcpy(&self->header, self->file_buf, SFFCommonHeader_size);
#if __BYTE_ORDER == __LITTLE_ENDIAN
    /* header fields are swapped to host order on little-endian hosts */
    self->header.magic_number = bswap_32(self->header.magic_number);
    self->header.version = bswap_32(self->header.version);
    self->header.index_offset = bswap_64(self->header.index_offset);
    self->header.index_length = bswap_32(self->header.index_length);
    self->header.number_of_reads = bswap_32(self->header.number_of_reads);
    self->header.header_length = bswap_16(self->header.header_length);
    self->header.key_length = bswap_16(self->header.key_length);
    self->header.num_flows_per_read = bswap_16(self->header.num_flows_per_read);
#endif

    if( self->header.magic_number != (('.'<<24)|('s'<<16)|('f'<<8)|('f'<<0)) ) {
        if( !skiped_idx_pad && prev_head.magic_number != 0 ) {
            /* possible concatination of 2 files with index at EOF and padded to 8 bytes
               with header values not padded, try skipping padding and reread */
            /* outer % 8 yields 0 for an already aligned index, so nothing is skipped then */
            uint32_t pad = (8 - prev_head.index_length % 8) % 8;
            if( pad != 0 ) {
                self->file_advance += pad;
                DEBUG_MSG(5, ("%s: trying to skip over %u bytes index section padding\n", self->file_name, pad));
                skiped_idx_pad = true;
                goto SkipIndexPad;
            }
        }
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
        SRALoaderFile_LOG(file, klogErr, rc, "magic number: $(m)", PLOG_U32(m), self->header.magic_number);
        return rc;
    }
    if( self->header.version != 1 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcBadVersion);
        SRALoaderFile_LOG(file, klogErr, rc, "format version $(v)", PLOG_U32(v), self->header.version);
        return rc;
    }
    if( self->header.flowgram_format_code != SFFFormatCodeUI16Hundreths ) {
        /* NOTE: add a case here if flowgram coding gets new version to support different */
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
        SRALoaderFile_LOG(file, klogErr, rc, "common header flowgram format code", NULL);
        return rc;
    }
    if( self->header.index_length % 8 != 0 ) {
        DEBUG_MSG(5, ("%s: index_length field value is not 8 byte padded: %u\n", self->file_name, self->header.index_length));
    }

    /* expected header size: fixed part + flow chars + key, rounded up to 8 bytes */
    head_sz = SFFCommonHeader_size + self->header.num_flows_per_read + self->header.key_length;
    head_sz += (head_sz % 8) ? (8 - (head_sz % 8)) : 0;
    if( head_sz != self->header.header_length ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcFormat, rcInvalid);
        SRALoaderFile_LOG(file, klogErr, rc, "header length $(h) <> $(s) ", PLOG_2(PLOG_U16(h),PLOG_U16(s)), self->header.header_length, head_sz);
        return rc;
    }

    /* read flow chars and key */
    self->file_advance = SFFCommonHeader_size;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, head_sz - SFFCommonHeader_size, "common header", false)) != 0) {
        return rc;
    }
    self->file_advance = head_sz - SFFCommonHeader_size;

    if( (rc = pstring_assign(&self->flow_chars, self->file_buf, self->header.num_flows_per_read)) != 0 ||
        (rc = pstring_assign(&self->key_seq, self->file_buf + self->header.num_flows_per_read, self->header.key_length)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "reading flows/key sequence", NULL);
        return rc;
    }
    if( prev_head.magic_number != 0 ) {
        /* next file's common header must match previous file's common header, partially */
        if( prev_head.key_length != self->header.key_length ||
            prev_head.num_flows_per_read != self->header.num_flows_per_read ||
            pstring_cmp(&prev_flow_chars, &self->flow_chars) != 0 ||
            pstring_cmp(&prev_key_seq, &self->key_seq) != 0 ) {
            rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInconsistent);
            SRALoaderFile_LOG(file, klogErr, rc, "previous file common header differ in flows/key sequence", NULL);
        }
    }
    if( rc == 0 ) {
        if( self->w454 ) {
            rc = SRAWriter454_WriteHead(self->w454, &self->flow_chars, &self->key_seq);
        } else {
            rc = SRAWriterIonTorrent_WriteHead(self->wIonTorrent, &self->flow_chars, &self->key_seq);
        }
    }
    return rc;
}
/* Parses one SRF read chunk with its ZTR blocks and writes the spot.
 *
 * ctx     - SRF context; it is cast to the enclosing fe_context_t
 * ztr_ctx - ZTR parser state; the chunk payload is appended to its buffer
 * data    - read chunk bytes
 * size    - number of bytes in data
 *
 * A region chunk must have been seen before (fe->region.nreads != 0). The read
 * id either names one read of the spot (type above eAbisolidReadType_SPOT) or
 * the whole spot; in the latter case BASE, CNF1 and SAMP data is cut into
 * reads using fe->region.start[]. Processing of ZTR blocks stops at the first
 * error so that the error is what gets returned.
 *
 * Returns 0 on success or a logged error rc.
 */
static rc_t parse_v1_read(SRF_context *ctx, ZTR_Context *ztr_ctx, const uint8_t *data, size_t size)
{
    rc_t rc = 0;
    size_t i, parsed;
    ztr_raw_t ztr_raw;
    ztr_t ztr;
    enum ztr_chunk_type type;
    fe_context_t* fe = (fe_context_t*)ctx;
    uint8_t flags;
    pstring readId;
    EAbisolidReadType read_type;
    pstring label;
    AbsolidRead read[ABSOLID_FMT_MAX_NUM_READS];

    if( fe->region.nreads == 0 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcNotFound);
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "missing region chunk before 1st read chunk", NULL);
    }
    if( (rc = SRF_ParseReadChunk(data, size, &parsed, &flags, &readId)) != 0 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rc);
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "SRF parsing failure", NULL);
    }
    ABI_ZTR_AddToBuffer(ztr_ctx, data + parsed, size - parsed);

    /* readId will have spotname */
    if( (rc = fe_new_read(fe, &readId, &read_type, &label)) != 0 ) {
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "parsing spot name suffix", NULL);
    }

    for(i = 0; i < sizeof(read) / sizeof(read[0]); i++) {
        AbsolidRead_Init(&read[i]);
    }

    /* rc is part of the condition: otherwise an error from a chunk handler
       below would be overwritten by the next ABI_ZTR_ParseBlock() call */
    while( rc == 0 && !ABI_ZTR_BufferIsEmpty(ztr_ctx) ) {
        if( (rc = ABI_ZTR_ParseBlock(ztr_ctx, &ztr_raw)) != 0 ||
            (rc = ABI_ZTR_ProcessBlock(ztr_ctx, &ztr_raw, &ztr, &type)) != 0 ) {
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "ZTR parsing failure", NULL);
            break;
        }

        switch (type) {
            case BASE:
                if(ztr.sequence->datatype != i8) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "read: expected 8-bit datatype", NULL);
                } else if( read_type > eAbisolidReadType_SPOT ) {
                    /* chunk holds a single read */
                    int read_number = AbisolidReadType2ReadNumber[read_type];
                    if( (rc = pstring_assign(&read[read_number].seq, ztr.sequence->data, ztr.sequence->datasize)) == 0 ) {
                        /* grab 1st, may be the only cs_key */
                        read[read_number].cs_key = fe->region.cs_key[0];
                        for(i = 1; i < fe->region.nreads; i++) {
                            if( read_type == fe->region.type[i] ) {
                                read[read_number].cs_key = fe->region.cs_key[i];
                                break;
                            }
                        }
                        SRF_set_read_filter(&read[read_number].filter, flags);
                        rc = pstring_copy(&read[read_number].label, &label);
                        DEBUG_MSG(3, ("SRF READ: '%s'\n", read[read_number].seq.data));
                    }
                    if( rc != 0 ) {
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying read", NULL);
                    }
                } else {
                    /* chunk holds the whole spot: cut it into reads by region starts */
                    for(i = 0; rc == 0 && i < fe->region.nreads; i++) {
                        int read_number = AbisolidReadType2ReadNumber[fe->region.type[i]];
                        size_t len = (i + 1 >= fe->region.nreads ? ztr.sequence->datasize : fe->region.start[i + 1]) - fe->region.start[i];
                        rc = pstring_assign(&read[read_number].seq, &ztr.sequence->data[fe->region.start[i]], len);
                        read[read_number].cs_key = fe->region.cs_key[i];
                        SRF_set_read_filter(&read[read_number].filter, flags);
                        /* only copy the label when the sequence was stored, so its error is not lost */
                        if( rc == 0 && fe->region.label[i].len != 0 ) {
                            rc = pstring_copy(&read[read_number].label, &fe->region.label[i]);
                        }
                        DEBUG_MSG(3, ("SRF READ[%u]: '%s'\n", i, read[read_number].seq.data));
                    }
                    if( rc != 0 ) {
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying reads", NULL);
                    }
                }
                break;

            case CNF1:
                if(ztr.quality1->datatype != i8) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "quality: expected 8-bit datatype", NULL);
                } else if( read_type > eAbisolidReadType_SPOT ) {
                    int read_number = AbisolidReadType2ReadNumber[read_type];
                    if( (rc = pstring_assign(&read[read_number].qual, ztr.quality1->data, ztr.quality1->datasize)) == 0 ) {
                        DEBUG_MSG(3, ("SRF QUAL: %u bytes\n", read[read_number].qual.len));
                    }
                    if( rc != 0 ) {
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying quality", NULL);
                    }
                } else {
                    for(i = 0; rc == 0 && i < fe->region.nreads; i++) {
                        int read_number = AbisolidReadType2ReadNumber[fe->region.type[i]];
                        size_t len = (i + 1 >= fe->region.nreads ? ztr.quality1->datasize : fe->region.start[i + 1]) - fe->region.start[i];
                        rc = pstring_assign(&read[read_number].qual, &ztr.quality1->data[fe->region.start[i]], len);
                        DEBUG_MSG(3, ("SRF QUAL[%u]: %u bytes\n", i, read[read_number].qual.len));
                    }
                    if( rc != 0 ) {
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying qualities", NULL);
                    }
                }
                break;

            case SAMP:
                if( !fe->skip_signal ) {
                    size_t i;
                    int stype = ABSOLID_FMT_COLMASK_NOTSET;

                    if(ztr.signal->datatype != f32) {
                        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "signal: expected 32-bit float datatype", NULL);
                    } else if( (ztr.signal->datasize % sizeof(float)) != 0 ) {
                        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInvalid);
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "signal: size not 32-bit float aligned", NULL);
                    } else if (ztr.signal->channel == NULL) {
                        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcIncomplete);
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "SIGNAL column: missing channel type", NULL);
                    } else if(strcmp(ztr.signal->channel, "0FAM") == 0) {
                        stype = ABSOLID_FMT_COLMASK_FAM;
                    } else if(strcmp(ztr.signal->channel, "1CY3") == 0) {
                        stype = ABSOLID_FMT_COLMASK_CY3;
                    } else if(strcmp(ztr.signal->channel, "2TXR") == 0) {
                        stype = ABSOLID_FMT_COLMASK_TXR;
                    } else if(strcmp(ztr.signal->channel, "3CY5") == 0) {
                        stype = ABSOLID_FMT_COLMASK_CY5;
                    } else {
                        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "SIGNAL column: unexpected channel type", NULL);
                    }
#if __BYTE_ORDER == __LITTLE_ENDIAN
                    /* swap each 32-bit value in place on little-endian hosts */
                    for(i = 0; rc == 0 && i < ztr.signal->datasize; i += 4) {
                        uint32_t* r = (uint32_t*)&ztr.signal->data[i];
                        *r = bswap_32(*r);
                    }
#endif
                    if( rc == 0 ) {
                        if( read_type > eAbisolidReadType_SPOT ) {
                            int read_number = AbisolidReadType2ReadNumber[read_type];
                            pstring* d = NULL;

                            switch(stype) {
                                case ABSOLID_FMT_COLMASK_FAM:
                                    read[read_number].fs_type = eAbisolidFSignalType_FAM;
                                    d = &read[read_number].fxx;
                                    break;
                                case ABSOLID_FMT_COLMASK_CY3:
                                    d = &read[read_number].cy3;
                                    break;
                                case ABSOLID_FMT_COLMASK_TXR:
                                    d = &read[read_number].txr;
                                    break;
                                case ABSOLID_FMT_COLMASK_CY5:
                                    d = &read[read_number].cy5;
                                    break;
                                default:
                                    break;
                            }
                            if( d ) {
                                rc = pstring_assign(d, ztr.signal->data, ztr.signal->datasize);
                                DEBUG_MSG(3, ("SRF SIGNAL[%s]: %u bytes\n", ztr.signal->channel, d->len));
                            } else {
                                rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
                            }
                            if( rc != 0 ) {
                                SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying signal", NULL);
                            }
                        } else {
                            if( fe->region.nreads <= 0 || fe->region.nreads > ABSOLID_FMT_MAX_NUM_READS ) {
                                rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
                                SRALoaderFile_LOG(fe->ctx.file, klogErr, rc, "read count $(c)", PLOG_U8(c), fe->region.nreads);
                            }
                            for(i = 0; rc == 0 && i < fe->region.nreads; i++) {
                                pstring* d = NULL;
                                int read_number = AbisolidReadType2ReadNumber[fe->region.type[i]];
                                /* region starts count values, signal data is addressed in bytes */
                                size_t len = (i + 1 >= fe->region.nreads) ? ztr.signal->datasize : (fe->region.start[i + 1] * sizeof(float));
                                len -= fe->region.start[i] * sizeof(float);

                                switch(stype) {
                                    case ABSOLID_FMT_COLMASK_FAM:
                                        read[read_number].fs_type = eAbisolidFSignalType_FAM;
                                        d = &read[read_number].fxx;
                                        break;
                                    case ABSOLID_FMT_COLMASK_CY3:
                                        d = &read[read_number].cy3;
                                        break;
                                    case ABSOLID_FMT_COLMASK_TXR:
                                        d = &read[read_number].txr;
                                        break;
                                    case ABSOLID_FMT_COLMASK_CY5:
                                        d = &read[read_number].cy5;
                                        break;
                                    default:
                                        break;
                                }
                                if( d ) {
                                    rc = pstring_assign(d, &ztr.signal->data[fe->region.start[i] * sizeof(float)], len);
                                    DEBUG_MSG(3, ("SRF SIGNAL[%s]: %u bytes\n", ztr.signal->channel, d->len));
                                } else {
                                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
                                }
                            }
                            if( rc != 0 ) {
                                SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying signals", NULL);
                            }
                        }
                    }
                }
                break;

            default:
                break;
        }
        if(type != none && type != ignore) {
            /* releases the pointer stored at the start of ztr */
            free(*(void **)&ztr);
        }
    }

    if(rc == 0) {
        if( read_type <= eAbisolidReadType_SPOT ) {
            rc = SRAWriteAbsolid_Write(fe->writer, ctx->file, &readId, NULL, &read[0], &read[1]);
        } else {
            switch( AbisolidReadType2ReadNumber[read_type] ) {
                case 0:
                    rc = SRAWriteAbsolid_Write(fe->writer, ctx->file, &readId, NULL, &read[0], NULL);
                    break;
                case 1:
                    rc = SRAWriteAbsolid_Write(fe->writer, ctx->file, &readId, NULL, NULL, &read[1]);
                    break;
                default:
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "more than 2 reads", NULL);
                    break;
            }
        }
    }
    return rc;
}