/*
 * Parses the first four tab-terminated fields of a data line as spot
 * coordinates.
 *
 * Each field is converted with strtol() into file->coord[0..3] (0 is stored
 * when strtol reports an error through errno) and its text is accumulated
 * into file->name as "f1:f2:f3:f4".
 *
 * file    - receives coord[] and name
 * data    - start of the line
 * data_sz - number of bytes available at data
 * tail    - optional; reset to NULL on entry, then set to the byte following
 *           the most recently processed tab of fields 2..4 (i.e. past the
 *           4th tab on success)
 *
 * Returns 0 on success, the rc of a failed pstring call, or an rcTooShort
 * error when fewer than four tab-terminated fields are present.
 */
static rc_t read_spot_coord(IlluminaFileInfo* file, const char* data, size_t data_sz, const char** tail)
{
    rc_t rc = 0;
    const char* t, *str = data, *end = data + data_sz;
    int tabs = 0;

    if( tail ) {
        *tail = NULL;
    }
    do {
        if( (t = memchr(str, '\t', end - str)) != NULL ) {
            switch( ++tabs ) {
                case 1:
                    errno = 0;
                    file->coord[0] = strtol(str, NULL, 10);
                    if( errno != 0 ) {
                        file->coord[0] = 0;
                    }
                    rc = pstring_assign(&file->name, str, t - str);
                    break;
                case 2:
                case 3:
                case 4:
                    errno = 0;
                    file->coord[tabs - 1] = strtol(str, NULL, 10);
                    if( errno != 0 ) {
                        file->coord[tabs - 1] = 0;
                    }
                    if( (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                        rc = pstring_append(&file->name, str, t - str);
                    }
                    if( tail ) {
                        *tail = t + 1;
                    }
                    break;
                default:
                    /* not reachable: the loop stops once 4 tabs were seen */
                    break;
            }
            str = ++t;
        }
    } while( rc == 0 && t != NULL && str < end && tabs < 4 );

    /* only report a short line when nothing else failed, so that an error
       returned by pstring_assign/pstring_append is not masked */
    if( rc == 0 && tabs < 4 ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcTooShort);
    }
    return rc;
}
/*
 * Read a fasta or quality record, which may be wrapped over several lines
 * (e.g. at a 70 column width), appending every line to 'str'.
 *
 * The first line is read with optional == false; every following line is
 * read with optional == true and reading stops, leaving that line in the
 * file buffer, when there is no line or the line starts with 'stop'.
 * If 'str' already contains a space, a single space is appended before
 * each continuation line.
 *
 * Returns 0 on success or the rc of the failing file_read_line /
 * pstring_append call.
 */
static rc_t read_multiline_seq_or_qual(FastqFileInfo* file, const char stop, pstring* str)
{
    rc_t rc = 0;
    bool is_continuation = false; /* at least one read attempt was processed */
    bool eof_allowed = false;     /* at least one line was stored in str */

    for( ; ; ) {
        rc = file_read_line(file, eof_allowed);
        if( rc != 0 ) {
            break;
        }
        if( eof_allowed ) {
            if( file->line == NULL ) {
                /* eof -> stop */
                break;
            }
            if( file->line_len > 0 && file->line[0] == stop ) {
                /* next line is defline -> stop, line stays in buffer */
                break;
            }
        }
        if( is_continuation && memchr(str->data, ' ', str->len) != NULL ) {
            rc = pstring_append(str, " ", 1);
        }
        if( rc == 0 ) {
            rc = pstring_append(str, file->line, file->line_len);
            if( rc == 0 ) {
                /* line processed */
                file->line = NULL;
                eof_allowed = true;
            }
        }
        is_continuation = true;
        if( rc != 0 ) {
            break;
        }
    }
    return rc;
}
/*
 * Builds the spot name and spot group for the read currently held in
 * self->read and writes it out through the Illumina writer.
 *
 * self   - loader context; uses name_prefix, read, writer and ctx.file
 * flags  - passed to SRF_set_read_filter() to set self->read.filter
 * readId - read identifier; if it contains '#', the text after '#' becomes
 *          the spot group and readId->len is shortened to the part before
 *          '#' (the bytes in readId->data are not modified)
 *
 * Returns 0 on success or the rc of the first failing step.
 */
static rc_t fe_new_read(fe_context_t *self, int flags, pstring *readId )
{
    rc_t rc;
    char *suffix;
    pstring readName, spotGroup;
    /* static storage: shared by all calls, re-initialized before each use */
    static IlluminaSpot spot;

    /* look for spot group */
    suffix = strchr(readId->data, '#');
    if( suffix != NULL ) {
        readId->len = suffix - readId->data;
        ++suffix; /* skip '#' */
        if( (rc = pstring_assign(&spotGroup, suffix, strlen(suffix))) != 0 ) {
            SRALoaderFile_LOG(self->ctx.file, klogInt, rc, "extracting barcode from spot '$(spotname)'", "spotname=%s", readId->data);
            return rc;
        }
    } else {
        pstring_clear(&spotGroup);
    }

    /* build the read name from prefix (self->name_prefix) and read id */
    if( self->name_prefix.len > 0 ) {
        if( (rc = pstring_copy(&readName, &self->name_prefix)) == 0 ) {
            /* separate a prefix ending in a digit from the read id with ':';
               cast to unsigned char because passing a negative char value
               to isdigit() is undefined behavior */
            if( isdigit((unsigned char)readName.data[readName.len - 1]) ) {
                rc = pstring_append(&readName, ":", 1);
            }
            if( rc == 0 ) {
                rc = pstring_concat(&readName, readId);
            }
        }
    } else {
        rc = pstring_copy(&readName, readId);
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(self->ctx.file, klogErr, rc, "preparing spot name $(spotname)", "spotname=%s", readId->data);
        return rc;
    }

    SRF_set_read_filter(&self->read.filter, flags);

    IlluminaSpot_Init(&spot);
    if( (rc = IlluminaSpot_Add(&spot, &readName, &spotGroup, &self->read)) == 0 ) {
        rc = SRAWriterIllumina_Write(self->writer, self->ctx.file, &spot);
    }
    return rc;
}
/*
 * Builds a spot name as prefix + suffix, inserting a single '_' between
 * them when the prefix is non-empty and neither the end of the prefix nor
 * the start of the suffix already is '_'.
 *
 * prefix - required, copied into name first
 * suffix - optional; skipped when NULL or empty
 * name   - required, receives the result
 *
 * Returns 0 on success; an rcParam/rcNull error when prefix or name is
 * NULL; otherwise the rc of the failing pstring call. Any failure is
 * logged with LOGERR.
 */
rc_t SRAWriteAbsolid_MakeName(const pstring* prefix, const pstring* suffix, pstring* name)
{
    rc_t rc = 0;

    if( prefix == NULL || name == NULL ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcParam, rcNull);
    } else if( (rc = pstring_copy(name, prefix)) == 0 ) {
        if( suffix && suffix->len > 0 ) {
            if( name->len > 0 && name->data[name->len - 1] != '_' && suffix->data[0] != '_' ) {
                rc = pstring_append(name, "_", 1);
            }
            if( rc == 0 ) {
                /* propagate a failed append instead of silently returning
                   a name without its suffix */
                rc = pstring_concat(name, suffix);
            }
        }
    }
    if( rc != 0 ) {
        LOGERR(klogErr, rc, "preparing spot name");
    }
    return rc;
}
/*
 * Appends the contents of 'src' (src->data, src->len) to 'dst' by
 * delegating to pstring_append() and returns that call's result.
 */
rc_t pstring_concat(pstring* dst, const pstring* src)
{
    rc_t result;

    result = pstring_append(dst, src->data, src->len);
    return result;
}
/*
 * Reads from a file the data for a single spot; data may be partial.
 *
 * Does nothing while file->ready is still set (previous data not consumed).
 * Otherwise re-initializes 'file', obtains a line through file_read_line()
 * and parses it according to file->type:
 *   - QSeq:     whole line handled by parse_qseq()
 *   - Quality4: whole line handled by read_quality(), name set to blk_pfx
 *   - Fasta/Noise/Intensity/Signal: first 4 coords go into the name
 *     (prefixed with "blk_pfx:" when blk_pfx is not NULL), the rest of the
 *     line is parsed as sequence or signal data
 * On success with data, file->line is cleared and file->ready is set.
 *
 * Returns 0 on success and on eof (file->line == NULL, ready not set);
 * otherwise the value produced by SRALoaderFile_LOG for the failure.
 */
static rc_t read_next_spot(const char* blk_pfx, IlluminaFileInfo* file)
{
    rc_t rc = 0;
    const char* tail;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    IlluminaFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    /* tail must refer to the line that is current *after* reading; taking
       it before file_read_line() left it stale (or NULL) and made the
       pointer subtraction below undefined for types without coords */
    tail = file->line;

    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
            if( (rc = parse_qseq(file, file->line, file->line_len)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading qseq");
            }
            break;

        case eIlluminaNativeFileTypeFasta:
        case eIlluminaNativeFileTypeNoise:
        case eIlluminaNativeFileTypeIntensity:
        case eIlluminaNativeFileTypeSignal:
            {
                /* read only common first 4 coords into name and prepend with DATA_BLOCK/@name */
                if( (rc = read_spot_coord(file, file->line, file->line_len, &tail)) == 0 ) {
                    if( blk_pfx != NULL ) {
                        pstring tmp_name;
                        if( (rc = pstring_copy(&tmp_name, &file->name)) == 0 &&
                            (rc = pstring_assign(&file->name, blk_pfx, strlen(blk_pfx))) == 0 &&
                            (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                            rc = pstring_concat(&file->name, &tmp_name);
                        }
                    }
                }
                if( rc != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading spot coord");
                }
                break;
            }

        case eIlluminaNativeFileTypeQuality4:
            /* NOTE(review): blk_pfx is passed to strlen() unchecked here,
               unlike the coord case above — confirm callers never pass NULL
               for this file type */
            if( (rc = read_quality(file->line, file->line_len, &file->read)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
            } else if( (rc = pstring_assign(&file->name, blk_pfx, strlen(blk_pfx))) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=name for quality 4");
            }
            break;

        default:
            rc = RC(rcSRA, rcFormatter, rcReading, rcFileFormat, rcUnknown);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=processing data line");
            break;
    }

    /* process tail (after coords) for some file types */
    file->line_len -= tail - file->line; /* length of tail */
    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
        case eIlluminaNativeFileTypeQuality4:
        default:
            /* completely processed before */
            break;

        case eIlluminaNativeFileTypeFasta:
            if( (rc = pstring_assign(&file->read.seq, tail, file->line_len)) != 0 ||
                !pstring_is_fasta(&file->read.seq) ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading fasta");
            }
            break;

        case eIlluminaNativeFileTypeNoise:
            if( (rc = read_signal(tail, file->line_len, &file->read.noise)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting noise");
            }
            break;

        case eIlluminaNativeFileTypeIntensity:
            if( (rc = read_signal(tail, file->line_len, &file->read.intensity)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting intensity");
            }
            break;

        case eIlluminaNativeFileTypeSignal:
            if( (rc = read_signal(tail, file->line_len, &file->read.signal)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting signal");
            }
            break;
    }
    file->line = NULL; /* line consumed */
    file->ready = true;
#if _DEBUGGING
    DEBUG_MSG(3, ("name:'%s' [%li:%li:%li:%li]\n", file->name.data,
                  file->coord[0], file->coord[1], file->coord[2], file->coord[3]));
    if( file->read.seq.len ) {
        DEBUG_MSG(3, ("seq:'%.*s'\n", file->read.seq.len, file->read.seq.data));
    }
    if( file->read.qual.len ) {
        DEBUG_MSG(3, ("qual{0x%x}: %u bytes\n", file->read.qual_type, file->read.qual.len));
    }
#endif
    return 0;
}
/*
 * Parses one line of a tab separated qseq file:
 *   first 2 positions concatenated with "_" into spot prefix
 *   next 4 positions concatenated with ":" into spot id: lane:tile:x:y
 *     (also converted with strtol into file->coord[0..3], 0 on errno)
 *   7th (index) is stored as barcode unless it is exactly "0" or "1"
 *   8th is read id: a single digit, 0 is mapped to ILLUMINAWRITER_READID_NONE
 *   9th fasta
 *   10th quality
 *   11th (optional) read filter: "1" -> pass, "0" -> reject
 *
 * The line must therefore contain exactly 9 or 10 tabs, sequence and
 * quality must have the same length, and the quality is finally passed
 * through pstring_quality_convert().
 *
 * Returns 0 on success; rcInvalid for a malformed line, rcInconsistent for
 * a sequence/quality length mismatch, or the rc of a failing pstring call.
 */
static rc_t parse_qseq(IlluminaFileInfo* file, const char* data, size_t data_sz)
{
    rc_t rc = 0;
    const char* t, *str = data, *end = data + data_sz;
    int tabs = 0;

    do {
        if( (t = memchr(str, '\t', end - str)) != NULL ) {
            switch( ++tabs ) {
                case 1:
                    rc = pstring_assign(&file->name, str, t - str);
                    break;
                case 2:
                    if( (rc = pstring_append(&file->name, "_", 1)) == 0 ) {
                        rc = pstring_append(&file->name, str, t - str);
                    }
                    break;
                case 3:
                case 4:
                case 5:
                case 6:
                    errno = 0;
                    file->coord[tabs - 3] = strtol(str, NULL, 10);
                    if( errno != 0 ) {
                        file->coord[tabs - 3] = 0;
                    }
                    if( (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                        rc = pstring_append(&file->name, str, t - str);
                    }
                    break;
                case 7:
                    /* a lone "0" or "1" is not treated as a barcode */
                    if( t - str != 1 || (*str != '0' && *str != '1') ) {
                        rc = pstring_assign(&file->barcode, str, t - str);
                    }
                    break;
                case 8:
                    /* cast: passing a negative char value to isdigit() is
                       undefined behavior */
                    if( t - str != 1 || !isdigit((unsigned char)*str) ) {
                        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
                    } else {
                        file->read.read_id = *str - '0';
                        if( file->read.read_id == 0 ) {
                            file->read.read_id = ILLUMINAWRITER_READID_NONE;
                        }
                    }
                    break;
                case 9:
                    rc = pstring_assign(&file->read.seq, str, t - str);
                    break;
                case 10:
                    file->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
                    rc = pstring_assign(&file->read.qual, str, t - str);
                    break;
                default:
                    /* extra fields: rejected below by the tab count check */
                    break;
            }
            str = ++t;
        }
    } while( rc == 0 && t != NULL && str < end );

    if( rc == 0 ) {
        if( tabs == 9 ) {
            /* no filter column: quality is the remainder of the line */
            file->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
            rc = pstring_assign(&file->read.qual, str, end - str);
        } else if( tabs == 10 ) {
            /* remainder of the line is the one-character read filter */
            if( end - str != 1 ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
            } else if( *str == '1' ) {
                file->read.filter = SRA_READ_FILTER_PASS;
            } else if( *str == '0' ) {
                file->read.filter = SRA_READ_FILTER_REJECT;
            } else {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
            }
        } else {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
        }
        if( rc == 0 ) {
            if( file->read.seq.len != file->read.qual.len ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
            } else {
                rc = pstring_quality_convert(&file->read.qual, eExperimentQualityEncoding_Ascii, 64, 0, 0x7F);
            }
        }
    }
    return rc;
}