/*
 * Reads the rest of a FASTQ record once the caller has parsed its '@' defline:
 * the sequence (possibly spanning several lines), the '+' quality defline and
 * the quality string (possibly spanning several lines).
 *
 * file       - input file state; the current line is dropped on entry
 * sd         - receives sequence, quality and quality type; flagged ready on success
 * best_word  - word index handed to parse_spot_name for the quality defline
 * best_score - score of the sequence defline; a quality defline reaching the
 *              same score must also agree on barcode and read id
 * qualType   - value stored into sd->read.qual_type
 *
 * Returns 0 on success; on failure returns the result of SRALoaderFile_LOG
 * for the error that was detected.
 */
static rc_t read_spot_data_3lines(FastqFileInfo* file, FileReadData* sd, uint8_t best_word, uint8_t best_score, int qualType)
{
    rc_t rc;

    /* the sequence defline is no longer needed */
    file->line = NULL;

    /* sequence: everything up to the line starting with '+' */
    rc = read_multiline_seq_or_qual(file, '+', &sd->read.seq);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence data");
    }
    if( !pstring_is_fasta(&sd->read.seq) ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected sequence data");
    }

    /* quality defline */
    rc = file_read_line(file, false);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
    }
    if( file->line[0] != '+' ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '+' on quality defline");
    }

    /* a bare '+' carries nothing to validate */
    if( file->line_len != 1 ) {
        FileReadData qdef;
        uint8_t qscore = parse_spot_name(file->file, &qdef, &file->line[1], file->line_len - 1, best_word);

        /* the quality defline may lack barcode and read id, so its score may be
           lower than best_score, but a non-empty defline must at least yield a name */
        if( qscore < 1 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not found");
        }
        /* names must always agree; barcode and read id only when the defline is as complete */
        if( !( pstring_cmp(&sd->name, &qdef.name) == 0 &&
               ( qscore != best_score ||
                 ( pstring_cmp(&sd->barcode, &qdef.barcode) == 0 &&
                   sd->read.read_id == qdef.read.read_id ) ) ) ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=quality defline do not match sequence defline");
        }
    }

    /* the quality defline is no longer needed */
    file->line = NULL;

    /* quality: everything up to the next line starting with '@' */
    rc = read_multiline_seq_or_qual(file, '@', &sd->read.qual);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=failed to read quality");
    }
    if( sd->read.qual.len <= 0 ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcEmpty);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=quality");
    }

    sd->read.qual_type = qualType;
    sd->ready = true;
    return 0;
}
/*
 * In a single-line form, tries to grab the last two chunks delimited by sep
 * into seq and qual; spaces directly following a separator are ignored.
 * The line is expected to look like "name<sep>seq<sep>qual".
 *
 * self - unused here
 * file - file state; file->line / file->line_len describe the current line and
 *        file->spot receives name, sequence, quality and quality type
 * sep  - separator character to split on
 *
 * Returns true when the name was parsed and sequence and quality were stored
 * in file->spot; false otherwise (a storage error is logged before returning).
 */
static bool find_seq_qual_by_sep(FastqLoaderFmt* self, FastqFileInfo* file, const char sep)
{
    const char* end;
    const char* seq;
    const char* qual;
    size_t seq_len;
    size_t qual_len;
    rc_t rc;

    FileReadData_init(file->spot, false);
    end = file->line + file->line_len;

    /* last separator starts the quality chunk, the one before it the sequence chunk */
    qual = memrchr(file->line, sep, file->line_len);
    if( qual == NULL ) {
        return false;
    }
    seq = memrchr(file->line, sep, qual - file->line);
    if( seq == NULL ) {
        return false;
    }
    /* everything in front of the sequence separator is the spot name */
    if( parse_spot_name(file->file, file->spot, file->line, seq - file->line, 1) == 0 ) {
        return false;
    }

    /* Step over the separator and leading spaces. The bound is checked before
       the dereference and the scan never passes the quality separator, so
       seq_len cannot underflow when sep itself is a space. */
    ++seq;
    while( seq < qual && *seq == ' ' ) {
        ++seq;
    }
    seq_len = qual - seq;

    /* same for quality, bounded by the end of the line */
    ++qual;
    while( qual < end && *qual == ' ' ) {
        ++qual;
    }
    qual_len = end - qual;

    /* With the bounded scans above, a non-empty chunk cannot start with a
       space or with sep, so checking the lengths is sufficient and nothing
       is read past the end of the line. */
    if( seq_len == 0 || qual_len == 0 ) {
        return false;
    }
    if( !match_seq_to_qual(seq, seq_len, qual, qual_len) ) {
        return false;
    }

    rc = pstring_assign(&file->spot->read.seq, seq, seq_len);
    if( rc == 0 ) {
        if( pstring_is_fasta(&file->spot->read.seq) ) {
            rc = pstring_assign(&file->spot->read.qual, qual, qual_len);
            if( rc == 0 ) {
                file->spot->read.qual_type = file->qualType;
                return true;
            }
        }
        /* not a sequence, or quality could not be stored: drop the sequence again */
        file->spot->read.seq.len = 0;
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=storing read data");
    }
    return false;
}
/*
 * Reads from a file the data for a single spot; the data may be partial.
 *
 * Recognized layouts, tried in this order:
 *   - single line "name<sep>seq<sep>qual" with ':' or ' ' as separator
 *   - '@' defline: 4-line record, optionally followed by a second read of the
 *     same spot (8-line form), stored in file->spot[1]
 *   - '>' defline: name followed by either sequence or quality lines
 * Afterwards the quality of every ready read is converted with
 * pstring_quality_convert; when file->qualOffset is 0 the offset is detected
 * (33 first, then 64) and remembered in file.
 *
 * Returns 0 on success, on end of file (file->line stays NULL and no spot is
 * marked ready), or when the previous spot has not been consumed yet;
 * otherwise returns the error that was logged.
 */
static rc_t read_next_spot(FastqLoaderFmt* self, FastqFileInfo* file)
{
    rc_t rc = 0;
    int k;

    if( file->spot->ready ) {
        /* data still not used */
        return 0;
    }
    FileReadData_init(file->spot, false);
    FileReadData_init(&file->spot[1], false);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }

    if( find_seq_qual_by_sep(self, file, ':') || find_seq_qual_by_sep(self, file, ' ') ) {
        /* single line forms */
        file->line = NULL; /* line consumed */
        file->spot->ready = true;
    } else if( file->line[0] == '>' || file->line[0] == '@' ) {
        /* 4 or 8 line format */
        FileReadData sd;
        uint8_t word = 0, best_word = 0;
        uint8_t score = 0, best_score = 0;

        /* try successive words of the defline and keep the best scoring spot name */
        do {
            score = parse_spot_name(file->file, &sd, &file->line[1], file->line_len - 1, ++word);
            if( score > best_score ) {
                if( (rc = pstring_copy(&file->spot->name, &sd.name)) != 0 ||
                    (rc = pstring_copy(&file->spot->barcode, &sd.barcode)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying read name");
                }
                file->spot->read.read_id = sd.read.read_id;
                best_score = score;
                best_word = word; /* used below for quality defline parsing */
            }
        } while( score != 0 );
        if( best_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcId, rcNotFound);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not detected");
        }

        if( file->line[0] == '@' ) {
            if( (rc = read_spot_data_3lines(file, file->spot, best_word, best_score, file->qualType)) != 0 ) {
                return rc;
            }
            /* now we MAY have the 5th line in the buffer, so check whether it
               starts the 2nd read of an 8-line record */
            if( file->line_len != 0 && file->line != NULL && file->line[0] == '@' ) {
                /* try to find read id on next line */
                FileReadData_init(&file->spot[1], false);
                if( parse_spot_name(file->file, &file->spot[1], &file->line[1], file->line_len - 1, best_word) == best_score ) {
                    if( pstring_cmp(&file->spot->name, &file->spot[1].name) == 0 &&
                        pstring_cmp(&file->spot->barcode, &file->spot[1].barcode) == 0 &&
                        file->spot->read.read_id != file->spot[1].read.read_id ) {
                        /* same name and barcode but a different read id: fill up second read */
                        if( (rc = read_spot_data_3lines(file, &file->spot[1], best_word, best_score, file->qualType)) != 0 ) {
                            return rc;
                        }
                    }
                }
            }
        } else {
            /* 2 line seq or quality form */
            file->line = NULL; /* line consumed */
            /* read sequence/quality */
            if( (rc = read_multiline_seq_or_qual(file, '>', &file->spot->read.seq)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading seq/qual data");
            }
            if( file->spot->read.seq.len == 0 ) {
                /* report a real error code: previously rc was still 0 here, so
                   the failure was logged but returned as success */
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcEmpty);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=empty string reading seq/qual data");
            } else if( !pstring_is_fasta(&file->spot->read.seq) ) {
                /* not a sequence, so it must be quality: move it over */
                if( (rc = pstring_copy(&file->spot->read.qual, &file->spot->read.seq)) != 0 ) {
                    /* previously this failure was silently dropped and the spot
                       was still marked ready */
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying quality data");
                }
                file->spot->read.qual_type = file->qualType;
                pstring_clear(&file->spot->read.seq);
            }
            file->spot->ready = true;
        }
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=file corrupt or format unknown");
    }

    /* every failure above has already returned, so rc is 0 here;
       convert quality for each read that was filled in */
    for( k = 0; k < 2; k++ ) {
        FileReadData* rd = &file->spot[k];

        if( !rd->ready || rd->read.qual_type == ILLUMINAWRITER_COLMASK_NOTSET ) {
            continue;
        }
        if( file->qualOffset == 0 ) {
            /* detect and remember: try offset 33 first, fall back to 64 when out of range */
            file->qualOffset = 33;
            file->qualMax = 94;
            rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
            if( GetRCState(rc) == rcOutofrange ) {
                file->qualOffset = 64;
                file->qualMax = 61;
                rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
            }
        } else {
            if( file->qualOffset == 33 ) {
                file->qualMax = 94;
            }
            rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
        }
        if( rc != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting quality");
        }
    }
    return 0;
}
/*
 * Reads from a file the data for a single spot; the data may be partial.
 *
 * blk_pfx - optional prefix prepended (followed by ':') to the name parsed from
 *           the coordinates; for quality4 files it is the whole name and is
 *           therefore required
 * file    - file state; receives name and read data and is marked ready
 *
 * Returns 0 on success, on end of file (file->line stays NULL, file not marked
 * ready), or when the previous spot has not been consumed yet; otherwise
 * returns the error that was logged.
 */
static rc_t read_next_spot(const char* blk_pfx, IlluminaFileInfo* file)
{
    rc_t rc = 0;
    const char* tail = NULL;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    IlluminaFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }

    /* Until a parser says otherwise, nothing of the line has been consumed.
       This must be set after the line is read: file types that never move
       tail would otherwise subtract a stale (possibly NULL) pointer below. */
    tail = file->line;

    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
            if( (rc = parse_qseq(file, file->line, file->line_len)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading qseq");
            }
            break;

        case eIlluminaNativeFileTypeFasta:
        case eIlluminaNativeFileTypeNoise:
        case eIlluminaNativeFileTypeIntensity:
        case eIlluminaNativeFileTypeSignal:
            /* read only common first 4 coords into name and prepend with DATA_BLOCK/@name */
            if( (rc = read_spot_coord(file, file->line, file->line_len, &tail)) == 0 ) {
                if( blk_pfx != NULL ) {
                    pstring tmp_name;
                    if( (rc = pstring_copy(&tmp_name, &file->name)) == 0 &&
                        (rc = pstring_assign(&file->name, blk_pfx, strlen(blk_pfx))) == 0 &&
                        (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                        rc = pstring_concat(&file->name, &tmp_name);
                    }
                }
            }
            if( rc != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading spot coord");
            }
            break;

        case eIlluminaNativeFileTypeQuality4:
            if( (rc = read_quality(file->line, file->line_len, &file->read)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
            } else if( blk_pfx == NULL ) {
                /* the prefix is the only source of the name here; strlen(NULL) is undefined */
                rc = RC(rcSRA, rcFormatter, rcReading, rcParam, rcNull);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=name for quality 4");
            } else if( (rc = pstring_assign(&file->name, blk_pfx, strlen(blk_pfx))) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=name for quality 4");
            }
            break;

        default:
            rc = RC(rcSRA, rcFormatter, rcReading, rcFileFormat, rcUnknown);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=processing data line");
    }

    /* process tail (after coords) for some file types */
    file->line_len -= tail - file->line; /* length of tail */
    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
        case eIlluminaNativeFileTypeQuality4:
        default:
            /* completely processed before */
            break;

        case eIlluminaNativeFileTypeFasta:
            if( (rc = pstring_assign(&file->read.seq, tail, file->line_len)) != 0 ||
                !pstring_is_fasta(&file->read.seq) ) {
                /* keep a storage error if there was one, otherwise flag the data as corrupt */
                rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading fasta");
            }
            break;

        case eIlluminaNativeFileTypeNoise:
            if( (rc = read_signal(tail, file->line_len, &file->read.noise)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting noise");
            }
            break;

        case eIlluminaNativeFileTypeIntensity:
            if( (rc = read_signal(tail, file->line_len, &file->read.intensity)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting intensity");
            }
            break;

        case eIlluminaNativeFileTypeSignal:
            if( (rc = read_signal(tail, file->line_len, &file->read.signal)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting signal");
            }
            break;
    }

    /* line consumed */
    file->line = NULL;
    file->ready = true;

#if _DEBUGGING
    DEBUG_MSG(3, ("name:'%s' [%li:%li:%li:%li]\n", file->name.data, file->coord[0], file->coord[1], file->coord[2], file->coord[3]));
    if( file->read.seq.len ) {
        DEBUG_MSG(3, ("seq:'%.*s'\n", file->read.seq.len, file->read.seq.data));
    }
    if( file->read.qual.len ) {
        DEBUG_MSG(3, ("qual{0x%x}: %u bytes\n", file->read.qual_type, file->read.qual.len));
    }
#endif
    return 0;
}
/*
 * Reads from a file the data for a single spot; the data may be partial.
 *
 * Two record layouts are accepted, selected by the first character of the
 * defline:
 *   '@' - fastq: name, sequence, a bare '+' line, quality
 *   '>' - fasta: name, sequence; quality is filled with a constant value
 *
 * self - unused here
 * file - file state; receives name, sequence and quality and is marked ready
 *
 * Returns 0 on success, on end of file (file->line stays NULL, file not marked
 * ready), or when the previous spot has not been consumed yet; otherwise
 * returns the error that was logged.
 */
static rc_t read_next_spot(HelicosLoaderFmt* self, HelicosFileInfo* file)
{
    rc_t rc = 0;
    char marker;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    HelicosFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }

    marker = file->line[0];
    if( marker != '@' && marker != '>' ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '@' or '>'");
    }

    /* defline: everything after the marker is the name */
    if( (rc = pstring_assign(&file->name, &file->line[1], file->line_len - 1)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading name");
    }
    file->line = NULL;

    /* sequence: a single line that must fit the buffer and look like bases */
    if( (rc = file_read_line(file, false)) != 0 ||
        file->line_len > sizeof(file->sequence.data) - 1 ||
        (rc = pstring_assign(&file->sequence, file->line, file->line_len)) != 0 ||
        !pstring_is_fasta(&file->sequence) ) {
        rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence");
    }
    file->line = NULL;

    if( marker == '@' ) {
        /*** fastq format **/
        /* quality defline must be exactly "+" */
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line[0] != '+' || file->line_len != 1 ) {
            /* keep a read error if there was one instead of masking it as corrupt data */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
        }
        file->line = NULL;

        if( (rc = file_read_line(file, false)) != 0 ||
            file->line_len > sizeof(file->quality.data) - 1 ||
            (rc = pstring_assign(&file->quality, file->line, file->line_len)) != 0 ||
            (rc = pstring_quality_convert(&file->quality, eExperimentQualityEncoding_Ascii, 33, 0, 0x7F)) != 0 ) {
            /* an over-long line leaves rc at 0; make sure the failure is
               returned as an error rather than as success */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
        }
        file->line = NULL;
    } else {
        /** fasta format **/
        /* no quality in the input: one byte of value 14 per base */
        file->quality.len = file->sequence.len;
        memset(file->quality.data, 14, file->quality.len);
    }
    file->ready = true;

#if _DEBUGGING
    DEBUG_MSG(3, ("READ: name:'%s', seq[%u]:'%s', qual[%u]\n",
        file->name.data, file->sequence.len, file->sequence.data, file->quality.len));
#endif
    return 0;
}