/*
 * Store a quality string in read->qual and convert it in place.
 *
 *   data, data_sz  text assigned to read->qual
 *   read           read record being filled; on success its qual_type is set
 *                  to ILLUMINAWRITER_COLMASK_QUALITY_LOGODDS4
 *
 * Returns 0 on success, otherwise the rc reported by pstring_assign() or
 * pstring_quality_convert(); qual_type is left untouched on failure.
 */
static rc_t read_quality(const char* data, size_t data_sz, IlluminaRead* read)
{
    rc_t status = pstring_assign(&read->qual, data, data_sz);

    if( status != 0 ) {
        return status;
    }
    /* decimal encoding, converted with the arguments 0, -128 and 127 */
    status = pstring_quality_convert(&read->qual, eExperimentQualityEncoding_Decimal, 0, -128, 127);
    if( status != 0 ) {
        return status;
    }
    read->qual_type = ILLUMINAWRITER_COLMASK_QUALITY_LOGODDS4;
    return 0;
}
/*
 * Reads from a file the data for a single spot; the data may be partial.
 *
 * Nothing is read while file->spot[0] is still marked ready (the previous
 * data has not been used yet). Otherwise both file->spot[0] and file->spot[1]
 * are reset, one line is read and interpreted as one of:
 *   - a single-line form, as recognized by find_seq_qual_by_sep() with ':' or
 *     ' ' as the separator;
 *   - an '@' defline: the rest of the record is loaded with
 *     read_spot_data_3lines(). If the line left in the buffer afterwards is
 *     another '@' defline carrying the same name and barcode but a different
 *     read id, it is loaded into file->spot[1] (8 line format);
 *   - a '>' defline: sequence or quality data is loaded with
 *     read_multiline_seq_or_qual(); data that is not fasta is stored as
 *     quality instead of sequence.
 * Finally the quality string of every ready read with a quality type set is
 * converted with pstring_quality_convert().
 *
 * Returns 0 on success and at end of file (no line available, spot left not
 * ready); otherwise a non-zero rc, logged through SRALoaderFile_LOG() in most
 * cases.
 */
static rc_t read_next_spot(FastqLoaderFmt* self, FastqFileInfo* file)
{
    rc_t rc = 0;
    int k;

    if( file->spot->ready ) {
        /* data still not used */
        return 0;
    }
    FileReadData_init(file->spot, false);
    FileReadData_init(&file->spot[1], false);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    }
    if( file->line == NULL ) {
        return 0; /* eof */
    }

    if( find_seq_qual_by_sep(self, file, ':') || find_seq_qual_by_sep(self, file, ' ') ) {
        /* single line forms */
        file->line = NULL; /* line consumed */
        file->spot->ready = true;
    } else if( file->line[0] == '>' || file->line[0] == '@' ) {
        /* 4 or 8 line format */
        /* NOTE(review): sd is handed to parse_spot_name() uninitialized; this relies on
           parse_spot_name() filling name, barcode and read_id whenever it returns a
           non-zero score - confirm */
        FileReadData sd;
        uint8_t word = 0, best_word = 0;
        uint8_t score = 0, best_score = 0;

        /* find and parse spot name on defline: try successive word indexes until
           parse_spot_name() returns 0, keeping the highest scoring result */
        do {
            score = parse_spot_name(file->file, &sd, &file->line[1], file->line_len - 1, ++word);
            if( score > best_score ) {
                if( (rc = pstring_copy(&file->spot->name, &sd.name)) != 0 ||
                    (rc = pstring_copy(&file->spot->barcode, &sd.barcode)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying read name");
                }
                file->spot->read.read_id = sd.read.read_id;
                best_score = score;
                best_word = word; /* used below for quality defline parsing */
            }
        } while( score != 0 );
        if( best_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcId, rcNotFound);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not detected");
        }

        if( file->line[0] == '@' ) {
            if( (rc = read_spot_data_3lines(file, file->spot, best_word, best_score, file->qualType)) != 0 ) {
                return rc;
            }
            /* now we MAY have 5th line in buffer so we can check if it's 2nd read for 8 lines format */
            if( file->line_len != 0 && file->line != NULL && file->line[0] == '@' ) {
                /* try to find read id on next line */
                FileReadData_init(&file->spot[1], false);
                if( parse_spot_name(file->file, &file->spot[1], &file->line[1], file->line_len - 1, best_word) == best_score ) {
                    if( pstring_cmp(&file->spot->name, &file->spot[1].name) == 0 &&
                        pstring_cmp(&file->spot->barcode, &file->spot[1].barcode) == 0 &&
                        file->spot->read.read_id != file->spot[1].read.read_id ) {
                        /* since it is different read id with same name and barcode, fill up second read */
                        if( (rc = read_spot_data_3lines(file, &file->spot[1], best_word, best_score, file->qualType)) != 0 ) {
                            return rc;
                        }
                    }
                }
            }
        } else {
            /* 2 line seq or quality form */
            file->line = NULL; /* line consumed */
            /* read sequence/quality */
            if( (rc = read_multiline_seq_or_qual(file, '>', &file->spot->read.seq)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading seq/qual data");
            }
            if( file->spot->read.seq.len == 0 ) {
                /* rc is still 0 here: give the failure a real error code so it is
                   not logged (and presumably returned) as success */
                rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=empty string reading seq/qual data");
            }
            if( !pstring_is_fasta(&file->spot->read.seq) ) {
                /* swap: the data is not fasta, so it is stored as quality */
                if( (rc = pstring_copy(&file->spot->read.qual, &file->spot->read.seq)) != 0 ) {
                    /* report the failure instead of marking the spot ready and returning 0 */
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying quality data");
                }
                file->spot->read.qual_type = file->qualType;
                pstring_clear(&file->spot->read.seq);
            }
            file->spot->ready = true;
        }
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=file corrupt or format unknown");
    }

    /* every path reaching this point has rc == 0 */
    for( k = 0; k < 2; k++ ) {
        FileReadData* rd = &file->spot[k];

        if( !rd->ready || rd->read.qual_type == ILLUMINAWRITER_COLMASK_NOTSET ) {
            continue;
        }
        if( file->qualOffset == 0 ) {
            /* detect and remember: try offset 33 (max 94) first and fall back to
               offset 64 (max 61) when the values are out of range */
            file->qualOffset = 33;
            file->qualMax = 94;
            rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
            if( GetRCState(rc) == rcOutofrange ) {
                /* NOTE(review): the retry assumes a failed conversion left the quality
                   string unmodified - confirm in pstring_quality_convert() */
                file->qualOffset = 64;
                file->qualMax = 61;
                rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
            }
        } else {
            if( file->qualOffset == 33 ) {
                file->qualMax = 94;
            }
            rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
        }
        if( rc != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting quality");
        }
    }
    return 0;
}
/*
 * Parses one record of a tab separated file into *file.
 *
 * Columns (1-based):
 *    1-2   concatenated with "_" into the spot name prefix
 *    3-6   appended to the name, each preceded by ":" (lane:tile:x:y); the
 *          numeric value of each is also stored in file->coord[0..3], or 0
 *          when strtol() sets errno
 *    7     index: stored as the barcode unless it is a single '0' or '1'
 *    8     read id, exactly one digit; 0 is stored as ILLUMINAWRITER_READID_NONE
 *    9     fasta (sequence)
 *    10    quality
 *    11    (optional) read filter: '1' is pass, '0' is reject
 *
 * data need not be NUL terminated; only data_sz bytes are examined by the
 * column scan.
 *
 * Returns 0 on success. Returns an rcInvalid rc when the column count is not
 * 10 or 11 or the read id / filter column is malformed, an rcInconsistent rc
 * when sequence and quality lengths differ, or whatever rc a pstring_*()
 * call reports.
 */
static rc_t parse_qseq(IlluminaFileInfo* file, const char* data, size_t data_sz)
{
    rc_t rc = 0;
    const char* t;
    const char* str = data;
    const char* end = data + data_sz;
    int tabs = 0;

    /* handle every tab terminated column; whatever follows the last tab is
       dealt with after the loop */
    while( rc == 0 && str < end && (t = memchr(str, '\t', end - str)) != NULL ) {
        switch( ++tabs ) {
            case 1:
                rc = pstring_assign(&file->name, str, t - str);
                break;
            case 2:
                if( (rc = pstring_append(&file->name, "_", 1)) == 0 ) {
                    rc = pstring_append(&file->name, str, t - str);
                }
                break;
            case 3:
            case 4:
            case 5:
            case 6:
                /* strtol() stops at the terminating tab */
                errno = 0;
                file->coord[tabs - 3] = strtol(str, NULL, 10);
                if( errno != 0 ) {
                    file->coord[tabs - 3] = 0;
                }
                if( (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                    rc = pstring_append(&file->name, str, t - str);
                }
                break;
            case 7:
                /* a lone '0' or '1' is not kept as a barcode */
                if( t - str != 1 || (*str != '0' && *str != '1') ) {
                    rc = pstring_assign(&file->barcode, str, t - str);
                }
                break;
            case 8:
                /* isdigit() requires a value representable as unsigned char */
                if( t - str != 1 || !isdigit((unsigned char)*str) ) {
                    rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
                } else {
                    file->read.read_id = *str - '0';
                    if( file->read.read_id == 0 ) {
                        file->read.read_id = ILLUMINAWRITER_READID_NONE;
                    }
                }
                break;
            case 9:
                rc = pstring_assign(&file->read.seq, str, t - str);
                break;
            case 10:
                file->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
                rc = pstring_assign(&file->read.qual, str, t - str);
                break;
            default:
                /* extra columns carry nothing; the tab count is rejected below */
                break;
        }
        str = t + 1;
    }
    if( rc != 0 ) {
        return rc;
    }

    if( tabs == 9 ) {
        /* no filter column: the quality runs to the end of the data */
        file->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
        rc = pstring_assign(&file->read.qual, str, end - str);
    } else if( tabs == 10 ) {
        /* the filter column must be exactly one character, '1' or '0' */
        if( end - str != 1 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
        } else if( *str == '1' ) {
            file->read.filter = SRA_READ_FILTER_PASS;
        } else if( *str == '0' ) {
            file->read.filter = SRA_READ_FILTER_REJECT;
        } else {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
        }
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
    }
    if( rc == 0 ) {
        if( file->read.seq.len != file->read.qual.len ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
        } else {
            rc = pstring_quality_convert(&file->read.qual, eExperimentQualityEncoding_Ascii, 64, 0, 0x7F);
        }
    }
    return rc;
}
/*
 * Reads from a file the data for a single spot; the data may be partial.
 *
 * Nothing is read while file->ready is still set (the previous data has not
 * been used yet). Otherwise *file is reset with HelicosFileInfo_init() and
 * one record is read in one of two formats, selected by the first character
 * of the defline:
 *   '@'  fastq: name, sequence, a quality defline that must be exactly "+",
 *        and a quality line converted with pstring_quality_convert()
 *   '>'  fasta: name and sequence; every quality value is set to 14
 * On success file->ready is set.
 *
 * Returns 0 on success and at end of file (no line available, ready left
 * unset); otherwise a non-zero rc that has been logged through
 * SRALoaderFile_LOG().
 */
static rc_t read_next_spot(HelicosLoaderFmt* self, HelicosFileInfo* file)
{
    rc_t rc = 0;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    HelicosFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }

    if( file->line[0] == '@' ) { /*** fastq format **/
        if( (rc = pstring_assign(&file->name, &file->line[1], file->line_len - 1)) != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading name");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line_len > sizeof(file->sequence.data)-1 ||
            (rc = pstring_assign(&file->sequence, file->line, file->line_len)) != 0 ||
            !pstring_is_fasta(&file->sequence) ) {
            /* the length and fasta checks fail with rc == 0 */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line[0] != '+' || file->line_len != 1 ) {
            /* keep the rc of a failed read; only a bad defline is reported as corrupt */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line_len > sizeof(file->quality.data)-1 ||
            (rc = pstring_assign(&file->quality, file->line, file->line_len)) != 0 ||
            (rc = pstring_quality_convert(&file->quality, eExperimentQualityEncoding_Ascii, 33, 0, 0x7F)) != 0 ) {
            /* an over-long quality line fails with rc == 0: give it an error
               code so the failure is not reported as success */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
        }
        file->line = NULL;
        file->ready = true;
    } else if( file->line[0] == '>' ) { /** fasta format **/
        if( (rc = pstring_assign(&file->name, &file->line[1], file->line_len - 1)) != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading name");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line_len > sizeof(file->sequence.data)-1 ||
            (rc = pstring_assign(&file->sequence, file->line, file->line_len)) != 0 ||
            !pstring_is_fasta(&file->sequence) ) {
            /* the length and fasta checks fail with rc == 0 */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence");
        }
        file->line = NULL;
        /* fasta carries no quality: use one value of 14 per base */
        file->quality.len = file->sequence.len;
        memset(file->quality.data, 14, file->quality.len);
        file->ready = true;
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '@' or '>'");
    }

#if _DEBUGGING
    DEBUG_MSG(3, ("READ: name:'%s', seq[%u]:'%s', qual[%u]\n",
        file->name.data, file->sequence.len, file->sequence.data, file->quality.len));
    /* variant that also dumps the quality data:
    DEBUG_MSG(3, ("READ: name:'%s', seq[%u]:'%s', qual[%u]:'%s'\n",
        file->name.data, file->sequence.len, file->sequence.data,
        file->quality.len, file->quality.data));*/
#endif
    return 0;
}