/* Copy the contents of src into dst.
 *
 * Returns a "null parameter" rc when src is NULL; otherwise the result of
 * pstring_assign() applied to src's data and length.
 */
rc_t pstring_copy(pstring* dst, const pstring* src)
{
    rc_t rc;

    if( src != NULL ) {
        rc = pstring_assign(dst, src->data, src->len);
    } else {
        rc = RC(rcSRA, rcFormatter, rcCopying, rcParam, rcNull);
    }
    return rc;
}
/*
 * In a single-line form, tries to grab the last two chunks delimited by sep
 * as sequence and quality.
 * Spaces following a separator are skipped.
 * Normally the line would look like "name sep seq sep sep qual".
 *
 * The text before the sequence separator is handed to parse_spot_name().
 * Returns true only if sequence and quality were both found, matched by
 * match_seq_to_qual(), the sequence passed pstring_is_fasta() and both were
 * stored in file->spot->read; returns false otherwise.
 */
static bool find_seq_qual_by_sep(FastqLoaderFmt* self, FastqFileInfo* file, const char sep)
{
    const char* const line_end = file->line + file->line_len;
    const char* seq_sep, *qual_sep;
    const char* seq, *qual;
    size_t seq_len, qual_len;
    rc_t rc;

    FileReadData_init(file->spot, false);

    /* last separator precedes quality, the one before it precedes sequence */
    qual_sep = memrchr(file->line, sep, file->line_len);
    if( qual_sep == NULL ) {
        return false;
    }
    seq_sep = memrchr(file->line, sep, qual_sep - file->line);
    if( seq_sep == NULL ) {
        return false;
    }
    if( parse_spot_name(file->file, file->spot, file->line, seq_sep - file->line, 1) == 0 ) {
        return false;
    }

    /* skip leading spaces; bounds are tested BEFORE the character is read */
    seq = seq_sep + 1;
    while( seq < line_end && *seq == ' ' ) {
        seq++;
    }
    /* when sep itself is a space the skip may run past the quality separator:
       treat that as an empty sequence instead of letting the length wrap around */
    seq_len = (seq < qual_sep) ? (size_t)(qual_sep - seq) : 0;

    qual = qual_sep + 1;
    while( qual < line_end && *qual == ' ' ) {
        qual++;
    }
    qual_len = (size_t)(line_end - qual);

    /* lengths first, so that *seq and *qual are only read inside the line */
    if( seq_len == 0 || qual_len == 0 ) {
        return false;
    }
    if( *seq == sep || *seq == ' ' || *qual == sep || *qual == ' ' ) {
        return false;
    }
    if( !match_seq_to_qual(seq, seq_len, qual, qual_len) ) {
        return false;
    }

    if( (rc = pstring_assign(&file->spot->read.seq, seq, seq_len)) == 0 ) {
        if( pstring_is_fasta(&file->spot->read.seq) ) {
            if( (rc = pstring_assign(&file->spot->read.qual, qual, qual_len)) == 0 ) {
                file->spot->read.qual_type = file->qualType;
                return true;
            }
        }
        /* not a fasta sequence or quality could not be stored: drop the sequence */
        file->spot->read.seq.len = 0;
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=storing read data");
    }
    return false;
}
/* Store a line of quality text in read->qual and convert it in place.
 *
 * The text is converted with pstring_quality_convert() using the decimal
 * encoding, offset 0 and the value range [-128, 127]. On success the read's
 * quality type is set to ILLUMINAWRITER_COLMASK_QUALITY_LOGODDS4.
 * Returns 0 or the rc of the first failing step.
 */
static rc_t read_quality(const char* data, size_t data_sz, IlluminaRead* read)
{
    rc_t rc = pstring_assign(&read->qual, data, data_sz);

    if( rc != 0 ) {
        return rc;
    }
    rc = pstring_quality_convert(&read->qual, eExperimentQualityEncoding_Decimal, 0, -128, 127);
    if( rc != 0 ) {
        return rc;
    }
    read->qual_type = ILLUMINAWRITER_COLMASK_QUALITY_LOGODDS4;
    return rc;
}
/* Build the spot name for the current read and pass the read to the Illumina writer.
 *
 * self    front-end context; self->read holds the read data, self->name_prefix
 *         the optional spot name prefix.
 * flags   forwarded to SRF_set_read_filter() to set self->read.filter.
 * readId  read id; if it contains '#', readId->len is cut at the '#' and the
 *         text after it becomes the spot group.
 *
 * The spot name is name_prefix followed by the read id, with ':' inserted
 * between them when the prefix ends in a digit; without a prefix the read id
 * is used alone. Returns 0 or the rc of the first failing step.
 */
static rc_t fe_new_read(fe_context_t *self, int flags, pstring *readId )
{
    rc_t rc;
    char *suffix;
    pstring readName, spotGroup;
    static IlluminaSpot spot;

    /* look for spot group */
    suffix = strchr(readId->data, '#');
    if( suffix != NULL ) {
        readId->len = suffix++ - readId->data;
        if( (rc = pstring_assign(&spotGroup, suffix, strlen(suffix))) != 0 ) {
            SRALoaderFile_LOG(self->ctx.file, klogInt, rc, "extracting barcode from spot '$(spotname)'", "spotname=%s", readId->data);
            return rc;
        }
    } else {
        pstring_clear(&spotGroup);
    }

    /* build the read name from prefix (self->name_prefix) and read id */
    if( self->name_prefix.len > 0 ) {
        if( (rc = pstring_copy(&readName, &self->name_prefix)) == 0 ) {
            /* <ctype.h> functions are undefined for negative char values,
               so the character is converted to unsigned char first */
            if( isdigit((unsigned char)readName.data[readName.len - 1]) ) {
                rc = pstring_append(&readName, ":", 1);
            }
            if( rc == 0 ) {
                rc = pstring_concat(&readName, readId);
            }
        }
    } else {
        rc = pstring_copy(&readName, readId);
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(self->ctx.file, klogErr, rc, "preparing spot name $(spotname)", "spotname=%s", readId->data);
        return rc;
    }

    SRF_set_read_filter(&self->read.filter, flags);

    IlluminaSpot_Init(&spot);
    if( (rc = IlluminaSpot_Add(&spot, &readName, &spotGroup, &self->read)) == 0 ) {
        rc = SRAWriterIllumina_Write(self->writer, self->ctx.file, &spot);
    }
    return rc;
}
/* Parse the first four tab-terminated fields of a line as spot coordinates.
 *
 * file     receives the numeric values in file->coord[0..3] (0 when strtol
 *          sets errno) and the four field texts joined by ':' in file->name.
 * data     start of the line, data_sz bytes long.
 * tail     optional; reset to NULL on entry and, once four fields were read,
 *          left pointing just past the 4th tab.
 *
 * Returns the rc of a failed pstring operation, rcTooShort when fewer than
 * four tab-terminated fields are present, 0 otherwise.
 */
static rc_t read_spot_coord(IlluminaFileInfo* file, const char* data, size_t data_sz, const char** tail)
{
    rc_t rc = 0;
    const char* t, *str = data, *end = data + data_sz;
    int tabs = 0;

    if( tail ) {
        *tail = NULL;
    }
    do {
        if( (t = memchr(str, '\t', end - str)) != NULL ) {
            switch(++tabs) {
                case 1:
                    errno = 0;
                    file->coord[0] = strtol(str, NULL, 10);
                    if( errno != 0 ) {
                        file->coord[0] = 0;
                    }
                    rc = pstring_assign(&file->name, str, t - str);
                    break;
                case 2:
                case 3:
                case 4:
                    errno = 0;
                    file->coord[tabs - 1] = strtol(str, NULL, 10);
                    if( errno != 0 ) {
                        file->coord[tabs - 1] = 0;
                    }
                    if( (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                        rc = pstring_append(&file->name, str, t - str);
                    }
                    if( tail ) {
                        *tail = t + 1;
                    }
                    break;
            }
            str = ++t;
        }
    } while( rc == 0 && t != NULL && str < end && tabs < 4 );

    /* report a short line only if nothing failed earlier, so that the rc of
       a failed pstring operation is not replaced by rcTooShort */
    if( rc == 0 && tabs < 4 ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcTooShort);
    }
    return rc;
}
/* Translate a read label into a read type and its label from the lookup table.
 *
 * *type is set from AbsolidRead_Suffix2ReadType(label). For a recognized type
 * the matching entry of AbisolidReadType2ReadLabel is stored in name; for an
 * unrecognized one only a debug message is emitted and name is left untouched.
 * Returns 0, or the rc of pstring_assign().
 */
static rc_t set_label_type(const char* label, pstring* name, EAbisolidReadType* type)
{
    const char* table_label;

    assert(name && type);

    *type = AbsolidRead_Suffix2ReadType(label);
    if( *type != eAbisolidReadType_Unknown ) {
        table_label = AbisolidReadType2ReadLabel[*type];
        return pstring_assign(name, table_label, strlen(table_label));
    }
    DEBUG_MSG(3, ("read label is not recognized: '%s'\n", label));
    return 0;
}
/* Read and validate the header of one read and extract the read name.
 *
 * The fixed-size part of the header is copied from the file buffer into
 * self->read_header (multi-byte fields are byte-swapped on little-endian
 * hosts). The expected header size - fixed part plus name, padded to a
 * multiple of 8 - must equal the header_length field. The name is then
 * stored in self->name.
 *
 * Returns 0, the rc of a failed block read or name copy, or rcInvalid when
 * the header length does not match.
 */
static rc_t SFFLoaderFmtReadDataHeader(SFFLoaderFmt* self, const SRALoaderFile* file)
{
    rc_t rc = 0;
    uint16_t head_sz = 0;

    /* Make sure the entire fixed portion of Read Header section is in the file buffer window */
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, SFFReadHeader_size, "read header", false)) != 0 ) {
        return rc;
    }
    memcpy(&self->read_header, self->file_buf, SFFReadHeader_size);
#if __BYTE_ORDER == __LITTLE_ENDIAN
    self->read_header.header_length = bswap_16(self->read_header.header_length);
    self->read_header.name_length = bswap_16(self->read_header.name_length);
    self->read_header.number_of_bases = bswap_32(self->read_header.number_of_bases);
    self->read_header.clip_quality_left = bswap_16(self->read_header.clip_quality_left);
    self->read_header.clip_quality_right = bswap_16(self->read_header.clip_quality_right);
    self->read_header.clip_adapter_left = bswap_16(self->read_header.clip_adapter_left);
    self->read_header.clip_adapter_right = bswap_16(self->read_header.clip_adapter_right);
#endif

    /* expected size: fixed part + name, rounded up to a multiple of 8 */
    head_sz = SFFReadHeader_size + self->read_header.name_length;
    head_sz += (head_sz % 8) ? (8 - (head_sz % 8)) : 0;
    if( head_sz != self->read_header.header_length ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcFormat, rcInvalid);
        /* log the value that was actually compared: the READ header length,
           not the common header's */
        SRALoaderFile_LOG(file, klogErr, rc, "read header length $(h) != $(s)", PLOG_2(PLOG_U16(h),PLOG_U16(s)),
                          self->read_header.header_length, head_sz);
        return rc;
    }

    /* read name */
    self->file_advance = SFFReadHeader_size;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, head_sz - SFFReadHeader_size, "read header", false)) != 0) {
        return rc;
    }
    self->file_advance = head_sz - SFFReadHeader_size;
    if( (rc = pstring_assign(&self->name, self->file_buf, self->read_header.name_length)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "copying read name", NULL);
    }
    return rc;
}
/* Parse one SRF read chunk: collect its ZTR blocks, decompress them into
 * fe->read and submit the read through fe_new_read().
 *
 * ctx      SRF context; it is also used as the fe_context_t of this front end.
 * ztr_ctx  ZTR parser state the chunk payload is appended to.
 * data     read chunk of `size` bytes.
 *
 * Sequence and at least one of quality4/quality1 are required; quality4 is
 * preferred when both are present. Signal, intensity and noise are optional.
 * The chunk objects kept in fe are freed before returning from the tail of
 * the function. Returns 0 or the rc of the first failure (logged).
 */
static rc_t parse_read(SRF_context *ctx, ZTR_Context *ztr_ctx, const uint8_t *data, size_t size)
{
    rc_t rc = 0;
    size_t parsed;
    uint8_t flags;
    pstring readId;
    ztr_raw_t ztr_raw;
    ztr_t ztr;
    enum ztr_chunk_type type;
    fe_context_t* fe = (fe_context_t*)ctx;

    /* forget chunks of the previous read */
    *(void **)&fe->sequence =
    *(void **)&fe->quality1 =
    *(void **)&fe->quality4 =
    *(void **)&fe->signal =
    *(void **)&fe->intensity =
    *(void **)&fe->noise = NULL;

    rc = SRF_ParseReadChunk(data, size, &parsed, &flags, &readId);
    if(rc) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rc);
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "corrupt", NULL);
    }

    /* deferred data (if any) goes in front of this chunk's payload */
    if(fe->defered != NULL)
        ZTR_AddToBuffer(ztr_ctx, fe->defered, fe->defered_len);
    ZTR_AddToBuffer(ztr_ctx, data + parsed, size - parsed);

    if(fe->defered == NULL) {
        /* try the payload as a block first; if that fails, parse it as a header */
        rc = ZTR_ParseBlock(ztr_ctx, &ztr_raw);
        if(rc == 0)
            goto PARSE_BLOCK;
        rc = ZTR_ParseHeader(ztr_ctx);
        if(rc) {
            return SRALoaderFile_LOG(ctx->file, klogErr, rc, "corrupt", NULL);
        }
    }

    while (!ZTR_BufferIsEmpty(ztr_ctx)) {
        rc = ZTR_ParseBlock(ztr_ctx, &ztr_raw);
PARSE_BLOCK:
        if(rc != 0 || (rc = ZTR_ProcessBlock(ztr_ctx, &ztr_raw, &ztr, &type)) != 0 ) {
            return SRALoaderFile_LOG(ctx->file, klogErr, rc, "corrupt", NULL);
        }

        switch (type) {
            case READ:
                if(ztr.sequence->datatype != i8) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    return SRALoaderFile_LOG(ctx->file, klogErr, rc, "invalid data type for sequence data", NULL);
                }
                fe->sequence = ztr;
                break;

            case QUALITY1:
                if(ztr.quality1->datatype != i8) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    return SRALoaderFile_LOG(ctx->file, klogErr, rc, "invalid data type for quality1 data", NULL);
                }
                fe->quality1 = ztr;
                break;

            case QUALITY4:
                if(ztr.quality4->datatype != i8) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    return SRALoaderFile_LOG(ctx->file, klogErr, rc, "invalid data type for quality4 data", NULL);
                }
                fe->quality4 = ztr;
                break;

            case SIGNAL4:
                /* the Type tag selects intensity ("SLXI"), noise ("SLXN") or plain signal;
                   a chunk whose kind is being skipped is freed right away */
                if(ztr.signal4->Type != NULL && strncmp(ztr.signal4->Type, "SLXI", 4) == 0 ) {
                    if( !fe->skip_intensity ) {
                        fe->intensity = ztr;
                    } else if(ztr.signal4){
                        if(ztr.signal4->data)
                            free(ztr.signal4->data);
                        free(ztr.signal4);
                    }
                } else if(ztr.signal4->Type != NULL && strncmp(ztr.signal4->Type, "SLXN", 4) == 0 ) {
                    if( !fe->skip_noise ) {
                        fe->noise = ztr;
                    } else if(ztr.signal4){
                        if(ztr.signal4->data)
                            free(ztr.signal4->data);
                        free(ztr.signal4);
                    }
                } else if( !fe->skip_signal ) {
                    fe->signal = ztr;
                } else if(ztr.signal4){
                    if(ztr.signal4->data)
                        free(ztr.signal4->data);
                    free(ztr.signal4);
                }
                break;

            default:
                free(*(void **)&ztr);
                /* fall through: raw data is released as well */
            case none:
            case ignore:
                if(ztr_raw.data) {
                    free(ztr_raw.data);
                }
                break;
        }
        if(ztr_raw.meta){
            free(ztr_raw.meta);
            ztr_raw.meta=NULL;
        }
    }

    /* single-pass loop: `break` jumps to the cleanup below */
    while(rc == 0) {
        if(*(void **)&fe->sequence == NULL) {
            rc = RC(rcSRA, rcFormatter, rcParsing, rcConstraint, rcViolated);
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "missing sequence data", NULL);
            break;
        }
        if(*(void **)&fe->quality4 == NULL && *(void **)&fe->quality1 == NULL) {
            rc = RC(rcSRA, rcFormatter, rcParsing, rcConstraint, rcViolated);
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "missing quality data", NULL);
            break;
        }
        if( (rc = ILL_ZTR_Decompress(ztr_ctx, BASE, fe->sequence, fe->sequence)) != 0 ||
            (rc = pstring_assign(&fe->read.seq, fe->sequence.sequence->data, fe->sequence.sequence->datasize)) != 0 ) {
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress sequence data", NULL);
            break;
        }
        if( *(void **)&fe->quality4 != NULL ) {
            if( (rc = ILL_ZTR_Decompress(ztr_ctx, CNF4, fe->quality4, fe->sequence)) != 0 ||
                (rc = pstring_assign(&fe->read.qual, fe->quality4.quality4->data, fe->quality4.quality4->datasize)) != 0 ) {
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress quality4 data", NULL);
                break;
            }
            fe->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_LOGODDS4;
        } else if( *(void **)&fe->quality1 != NULL ) {
            /* data AND size are both taken through the quality1 member */
            if( (rc = ILL_ZTR_Decompress(ztr_ctx, CNF1, fe->quality1, fe->sequence)) != 0 ||
                (rc = pstring_assign(&fe->read.qual, fe->quality1.quality1->data, fe->quality1.quality1->datasize)) != 0 ) {
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress quality1 data", NULL);
                break;
            }
            fe->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
        }
        if( *(void **)&fe->signal != NULL ) {
            if( (rc = ILL_ZTR_Decompress(ztr_ctx, SMP4, fe->signal, fe->sequence)) != 0 ||
                (rc = pstring_assign(&fe->read.signal, fe->signal.signal4->data, fe->signal.signal4->datasize)) != 0 ) {
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress signal data", NULL);
                break;
            }
        }
        if( *(void **)&fe->intensity != NULL ) {
            if( (rc = ILL_ZTR_Decompress(ztr_ctx, SMP4, fe->intensity, fe->sequence)) != 0 ||
                (rc = pstring_assign(&fe->read.intensity, fe->intensity.signal4->data, fe->intensity.signal4->datasize)) != 0 ) {
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress intensity data", NULL);
                break;
            }
        }
        if( *(void **)&fe->noise != NULL ) {
            if( (rc = ILL_ZTR_Decompress(ztr_ctx, SMP4, fe->noise, fe->sequence)) != 0 ||
                (rc = pstring_assign(&fe->read.noise, fe->noise.signal4->data, fe->noise.signal4->datasize)) != 0 ) {
                SRALoaderFile_LOG(ctx->file, klogErr, rc, "failed to decompress noise data", NULL);
                break;
            }
        }
        rc = fe_new_read(fe, flags, &readId);
        break;
    }

    /* release the chunks collected for this read */
    if(fe->sequence.sequence) {
        if(fe->sequence.sequence->data)
            free(fe->sequence.sequence->data);
        free(fe->sequence.sequence);
    }
    if(fe->quality1.quality1) {
        if(fe->quality1.quality1->data)
            free(fe->quality1.quality1->data);
        free(fe->quality1.quality1);
    }
    if(fe->quality4.quality4) {
        if(fe->quality4.quality4->data)
            free(fe->quality4.quality4->data);
        free(fe->quality4.quality4);
    }
    if(fe->signal.signal4) {
        if(fe->signal.signal4->data)
            free(fe->signal.signal4->data);
        free(fe->signal.signal4);
    }
    if(fe->intensity.signal4) {
        if(fe->intensity.signal4->data)
            free(fe->intensity.signal4->data);
        free(fe->intensity.signal4);
    }
    if(fe->noise.signal4) {
        if(fe->noise.signal4->data)
            free(fe->noise.signal4->data);
        free(fe->noise.signal4);
    }
    return rc;
}
/* Parses the spot name as the word_number-th (1-based) space-separated word
 * of str, which is len bytes long.
 * Looks for name(#barcode)?([\/.]\d)?
 *
 * On success spot->name, spot->barcode and spot->read.read_id are filled in
 * as far as the parts were found.
 * Returns the score (number of parts found); 0 means the word was not found
 * or storing a part failed.
 */
static uint8_t parse_spot_name(const SRALoaderFile* file, FileReadData* spot, const char* str, size_t len, uint8_t word_number)
{
    uint8_t w, score = 0;
    const char* name, *name_end;

    name = name_end = str;
    /* set name_end to end of word_number-th word */
    for(w = 1; w <= word_number || name_end == NULL; w++ ) {
        /* skip consecutive spaces; the bound is tested before the character is read */
        while( name_end != &str[len] && *name_end == ' ' ) {
            name_end++;
        }
        name = name_end;
        name_end = memchr(name, ' ', len - (name_end - str));
        if( name_end == NULL ) {
            if( w == word_number ) {
                name_end = &str[len];
            }
            break;
        }
    }
    if( name != name_end && name_end != NULL ) {
        char* x;
        rc_t rc;
        bool can_strip;

        /* init only name portion */
        FileReadData_init(spot, true);
        --name_end; /* goto last char */

        /* a "<sep><digit>" suffix is only looked for when at least one name
           character precedes it: this keeps name_end[-1] inside the word and
           name_end at or after name once the suffix is stripped */
        can_strip = (name_end - name) >= 2;

        if( can_strip && isdigit((unsigned char)name_end[0]) && (name_end[-1] == '\\' || name_end[-1] == '/' )) {
            score++;
            spot->read.read_id = name_end[0] - '0';
            name_end -= 2;
        } else if( can_strip && isdigit((unsigned char)*name_end) && name_end[-1] == '.' ) {
            int q = 0;
            if( memrchr(name, '#', name_end - name) != NULL ) {
                /* have barcode -> this is read id */
                q = 4;
            } else {
                /* may be a read id, check to see if 4 coords follow */
                const char* end = name_end - 1;
                while( --end >= name ) {
                    if( strchr(":|_", *end) != NULL ) {
                        q++;
                    } else if( !isdigit((unsigned char)*end) ) {
                        break;
                    }
                }
            }
            if( q == 4 ) {
                score++;
                spot->read.read_id = name_end[0] - '0';
                name_end -= 2;
            }
        }

        if( (x = memrchr(name, '#', name_end - name)) != NULL ) {
            score++;
            if( (rc = pstring_assign(&spot->barcode, x + 1, name_end - x)) != 0 ) {
                SRALoaderFile_LOG(file, klogErr, rc, "barcode $(b)", "b=%.*s", name_end - x, x + 1);
                return 0;
            }
            if( pstring_strcmp(&spot->barcode, "0") == 0 ) {
                pstring_clear(&spot->barcode);
            } else if( spot->barcode.len >= 4 &&
                       (strncmp(spot->barcode.data, "0/1_", 4) == 0 || strncmp(spot->barcode.data, "0/2_", 4) == 0) ) {
                /* barcode of the form "0/<read id>_<barcode>" */
                spot->read.read_id = spot->barcode.data[2] - '0';
                pstring_assign(&spot->barcode, &spot->barcode.data[4], spot->barcode.len - 4);
            }
            name_end = --x;
        }

        score++;
        if( (rc = pstring_assign(&spot->name, name, name_end - name + 1)) != 0 ) {
            SRALoaderFile_LOG(file, klogErr, rc, "spot name $(n)", "n=%.*s", name_end - name + 1, name);
            return 0;
        }

        /* search for _R\d\D in name and use it as read id, remove from name or spot won't assemble */
        x = spot->name.data;
        while( (x = strrchr(x, 'R')) != NULL ) {
            if( x != spot->name.data && *(x - 1) == '_' &&
                isdigit((unsigned char)*(x + 1)) && !isalnum((unsigned char)*(x + 2)) ) {
                score++;
                if(spot->read.read_id == -1){
                    spot->read.read_id = *(x + 1) - '0';
                }
                /* source and destination overlap, which strcpy() does not
                   allow; memmove() copies the tail including the terminator */
                memmove(x - 1, x + 2, strlen(x + 2) + 1);
                /* NOTE(review): three characters ("_R<digit>") are removed above
                   but the length is reduced by 4 - confirm whether dropping one
                   more character is intended */
                spot->name.len -= 4;
                break;
            }
            x++;
        }

        /* find last '=' and use only whatever is to the left of it */
        if( (x = memrchr(spot->name.data, '=', spot->name.len)) != NULL ) {
            rc = pstring_assign(&spot->name, spot->name.data, (x - spot->name.data) );
        }
    }
    return score;
}
/* Group the input files of a run, validate the groups and parse them.
 *
 * self             loader format object; passed to FGroup_Parse and source of
 *                  the final bad-spot counter.
 * argc, argv       the input files.
 * spots_bad_count  receives self->spots_bad_count on exit.
 *
 * Files are visited once per entry of file_types, in file_types order, so
 * that within a group they are chained in that order. Each file either joins
 * an existing group found by FGroup_Find or starts a new group, which is
 * inserted into the list ordered by the first two coordinates of its first
 * file. Returns 0 or the rc of the first failure.
 */
static rc_t IlluminaLoaderFmt_WriteData(IlluminaLoaderFmt* self, uint32_t argc, const SRALoaderFile* const argv[], int64_t* spots_bad_count)
{
    rc_t rc = 0;
    uint32_t t, i, k, ftype_q = sizeof(file_types) / sizeof(file_types[0]);
    SLList files;
    /* file object under construction; non-NULL only while no group owns it */
    IlluminaFileInfo* file = NULL;

    SLListInit(&files);

    /* group files using spotname, for _prb. file name prefix is used,
       files reviewed by type detected from name and ordered by file_type array */
    for(t = 0; rc == 0 && t < ftype_q; t++) {
        for(i = 0; rc == 0 && i < argc; i++) {
            const char* fname, *blk_pfx;
            int prefix_len = 0;
            ERunFileType ftype;
            EIlluminaNativeFileType type = eIlluminaNativeFileTypeNotSet;
            FGroup_Find_data data;

            if( (rc = SRALoaderFileName(argv[i], &fname)) != 0 ) {
                SRALoaderFile_LOG(argv[i], klogErr, rc, "reading file name", NULL);
                break;
            }
            if( (rc = SRALoaderFile_FileType(argv[i], &ftype)) != 0 ) {
                SRALoaderFile_LOG(argv[i], klogErr, rc, "reading file type", NULL);
                break;
            }
            if( (rc = SRALoaderFileBlockName(argv[i], &blk_pfx)) != 0 ) {
                SRALoaderFile_LOG(argv[i], klogErr, rc, "reading DATA_BLOCK/@name", NULL);
                break;
            }
            if( blk_pfx == NULL ) {
                blk_pfx = "";
            }
            {{
                /* skip path if present */
                const char* p = strrchr(fname, '/');
                fname = p ? p + 1 : fname;
                p = NULL;
                /* detect the type from the first file_types key found in the file name;
                   the text before that key becomes the file name prefix */
                for(k = 0; type == eIlluminaNativeFileTypeNotSet && k < ftype_q; k++) {
                    const char* const* e = file_types[k].key;
                    while( *e != NULL ) {
                        p = strstr(fname, *e++);
                        if( p != NULL ) {
                            type = file_types[k].type;
                            break;
                        }
                    }
                }
                if( p != NULL ) {
                    prefix_len = p - fname;
                }
            }}
            /* an explicit file type overrides the type guessed from the name */
            if( ftype == rft_IlluminaNativeSeq ) {
                type = eIlluminaNativeFileTypeFasta;
            } else if( ftype == rft_IlluminaNativePrb ) {
                type = eIlluminaNativeFileTypeQuality4;
            } else if( ftype == rft_IlluminaNativeInt ) {
                type = eIlluminaNativeFileTypeIntensity;
            } else if( ftype == rft_IlluminaNativeQseq ) {
                type = eIlluminaNativeFileTypeQSeq;
            }
            if( type == eIlluminaNativeFileTypeNotSet ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
                SRALoaderFile_LOG(argv[i], klogErr, rc, "detecting file type by file name", NULL);
                break;
            }
            if( type != file_types[t].type ) {
                /* one type at a time */
                continue;
            }
            DEBUG_MSG(3, ("file '%s' type set to %d\n", fname, type));

            file = calloc(1, sizeof(*file));
            if( file == NULL ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcMemory, rcExhausted);
                SRALoaderFile_LOG(argv[i], klogErr, rc, "allocating file object", NULL);
                break;
            }
            IlluminaFileInfo_init(file);
            file->file = argv[i];
            file->type = type;
            if( file->type == eIlluminaNativeFileTypeQuality4 ) {
                /* in _prb there is no spotname inside so use file prefix */
                rc = pstring_assign(&data.key, fname, prefix_len);
            } else {
                /* try to get 1st spot so group can be organized by spot name */
                if( (rc = read_next_spot(blk_pfx, file)) != 0 || !file->ready ) {
                    rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcNotFound);
                    SRALoaderFile_LOG(argv[i], klogErr, rc, "reading 1st spot", NULL);
                    break;
                }
                rc = pstring_copy(&data.key, &file->name);
            }
            /* NOTE(review): rc of the key assignment above is not tested before the lookup */
            data.found = NULL;
            if( SLListDoUntil(&files, FGroup_Find, &data) && data.found != NULL ) {
                /* existing group: walk its chain and append the file at the end;
                   a second file of the same type is an error, except for qseq */
                IlluminaFileInfo* ss = data.found->files;
                while( rc == 0 && file != NULL ) {
                    if( ss->type != eIlluminaNativeFileTypeQSeq && ss->type == file->type ) {
                        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcDuplicate);
                        SRALoaderFile_LOG(argv[i], klogErr, rc, "type of file for lane", NULL);
                    } else if( ss->next != NULL ) {
                        ss = ss->next;
                    } else {
                        ss->next = file;
                        file->prev = ss;
                        data.found->mask |= file->type;
                        /* group owns the file now */
                        file = NULL;
                    }
                }
            } else {
                /* no group yet: create one keyed by the file name prefix */
                data.found = calloc(1, sizeof(*data.found));
                if( data.found == NULL ) {
                    rc = RC(rcSRA, rcFormatter, rcReading, rcMemory, rcInsufficient);
                    SRALoaderFile_LOG(argv[i], klogErr, rc, "preparing file group", NULL);
                    break;
                } else {
                    if( (rc = pstring_assign(&data.found->key, fname, prefix_len)) != 0 ) {
                        SRALoaderFile_LOG(argv[i], klogErr, rc, "setting file group key", NULL);
                        FGroup_Whack(&data.found->dad, NULL);
                        break;
                    } else {
                        FGroup* curr = (FGroup*)SLListHead(&files), *prev = NULL;
                        data.found->blk_pfx = blk_pfx;
                        data.found->files = file;
                        data.found->mask = file->type;
                        /* group inserted into list by coords in 1st spot:
                           in front of the first group whose (coord[0], coord[1]) is greater */
                        while( curr != NULL ) {
                            if( curr->files[0].coord[0] > file->coord[0] ||
                                (curr->files[0].coord[0] == file->coord[0] && curr->files[0].coord[1] > file->coord[1]) ) {
                                data.found->dad.next = &curr->dad;
                                if( prev == NULL ) {
                                    files.head = &data.found->dad;
                                } else {
                                    prev->dad.next = &data.found->dad;
                                }
                                break;
                            }
                            prev = curr;
                            curr = (FGroup*)curr->dad.next;
                        }
                        /* no greater group found: append */
                        if( curr == NULL ) {
                            SLListPushTail(&files, &data.found->dad);
                        }
                        /* group owns the file now */
                        file = NULL;
                    }
                }
            }
        }
    }
    if( rc == 0 ) {
        SLListForEach(&files, FGroup_Validate, &rc);
    }
    if( rc == 0 ) {
        FGroup_Parse_data data;
        data.self = self;
        if( SLListDoUntil(&files, FGroup_Parse, &data) ) {
            rc = data.rc;
        }
    } else {
        /* release a file object that was not attached to any group */
        free(file);
    }
    SLListWhack(&files, FGroup_Whack, NULL);
    *spots_bad_count = self->spots_bad_count;
    return rc;
}
/* Reads data for a single spot from a file; the data may be partial.
 *
 * blk_pfx  DATA_BLOCK name: prepended to the spot name for the coordinate
 *          based file types and used as the whole name for quality4 files.
 * file     file state; nothing is read while file->ready is still set.
 *
 * On success file->ready is set and file->line is reset to NULL. At end of
 * file 0 is returned with file->ready left unset. Errors are logged and
 * their rc returned.
 */
static rc_t read_next_spot(const char* blk_pfx, IlluminaFileInfo* file)
{
    rc_t rc = 0;
    const char* tail = NULL;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    IlluminaFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    /* the tail starts at the beginning of the line just read; only the types
       that carry coordinates move it forward */
    tail = file->line;

    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
            if( (rc = parse_qseq(file, file->line, file->line_len)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading qseq");
            }
            break;

        case eIlluminaNativeFileTypeFasta:
        case eIlluminaNativeFileTypeNoise:
        case eIlluminaNativeFileTypeIntensity:
        case eIlluminaNativeFileTypeSignal:
            {{
            /* read only common first 4 coords into name and prepend with DATA_BLOCK/@name */
            if( (rc = read_spot_coord(file, file->line, file->line_len, &tail)) == 0 ) {
                if( blk_pfx != NULL ) {
                    pstring tmp_name;
                    if( (rc = pstring_copy(&tmp_name, &file->name)) == 0 &&
                        (rc = pstring_assign(&file->name, blk_pfx, strlen(blk_pfx))) == 0 &&
                        (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                        rc = pstring_concat(&file->name, &tmp_name);
                    }
                }
            }
            if( rc != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading spot coord");
            }
            break;
            }}

        case eIlluminaNativeFileTypeQuality4:
            {
            /* same NULL tolerance as the coordinate branch above */
            const char* pfx = (blk_pfx != NULL) ? blk_pfx : "";
            if( (rc = read_quality(file->line, file->line_len, &file->read)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
            } else if( (rc = pstring_assign(&file->name, pfx, strlen(pfx))) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=name for quality 4");
            }
            break;
            }

        default:
            rc = RC(rcSRA, rcFormatter, rcReading, rcFileFormat, rcUnknown);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=processing data line");
            break;
    }

    /* process tail (after coords) for some file types */
    file->line_len -= tail - file->line; /* length of tail */
    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
        case eIlluminaNativeFileTypeQuality4:
        default:
            /* completely processed before */
            break;

        case eIlluminaNativeFileTypeFasta:
            if( (rc = pstring_assign(&file->read.seq, tail, file->line_len)) != 0 ||
                !pstring_is_fasta(&file->read.seq) ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading fasta");
            }
            break;

        case eIlluminaNativeFileTypeNoise:
            if( (rc = read_signal(tail, file->line_len, &file->read.noise)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting noise");
            }
            break;

        case eIlluminaNativeFileTypeIntensity:
            if( (rc = read_signal(tail, file->line_len, &file->read.intensity)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting intensity");
            }
            break;

        case eIlluminaNativeFileTypeSignal:
            if( (rc = read_signal(tail, file->line_len, &file->read.signal)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting signal");
            }
            break;
    }
    file->line = NULL;
    file->ready = true;

#if _DEBUGGING
    DEBUG_MSG(3, ("name:'%s' [%li:%li:%li:%li]\n", file->name.data,
                  file->coord[0], file->coord[1], file->coord[2], file->coord[3]));
    if( file->read.seq.len ) {
        DEBUG_MSG(3, ("seq:'%.*s'\n", file->read.seq.len, file->read.seq.data));
    }
    if( file->read.qual.len ) {
        DEBUG_MSG(3, ("qual{0x%x}: %u bytes\n", file->read.qual_type, file->read.qual.len));
    }
#endif
    return 0;
}
/*
 * Parses one line of a tab-separated qseq file:
 *   fields 1-2  joined with "_" into the spot name prefix
 *   fields 3-6  appended with ":" as lane:tile:x:y and stored in file->coord[0..3]
 *   field  7    index: stored as barcode unless it is a single '0' or '1'
 *   field  8    read id, a single digit ('0' means no read id)
 *   field  9    fasta
 *   field  10   quality
 *   field  11   (optional) read filter, '1' = pass, '0' = reject
 *
 * Sequence and quality must have the same length; the quality is converted
 * from ASCII with offset 64. Returns 0 or the rc describing the failure.
 */
static rc_t parse_qseq(IlluminaFileInfo* file, const char* data, size_t data_sz)
{
    rc_t rc = 0;
    const char* field = data;
    const char* const stop = data + data_sz;
    int ntabs = 0;

    /* consume every tab-terminated field */
    for( ; ; ) {
        const char* tab = memchr(field, '\t', stop - field);
        size_t flen;

        if( tab == NULL ) {
            break;
        }
        flen = tab - field;
        ++ntabs;

        if( ntabs == 1 ) {
            rc = pstring_assign(&file->name, field, flen);
        } else if( ntabs == 2 ) {
            rc = pstring_append(&file->name, "_", 1);
            if( rc == 0 ) {
                rc = pstring_append(&file->name, field, flen);
            }
        } else if( ntabs <= 6 ) {
            errno = 0;
            file->coord[ntabs - 3] = strtol(field, NULL, 10);
            if( errno != 0 ) {
                file->coord[ntabs - 3] = 0;
            }
            rc = pstring_append(&file->name, ":", 1);
            if( rc == 0 ) {
                rc = pstring_append(&file->name, field, flen);
            }
        } else if( ntabs == 7 ) {
            if( flen != 1 || (*field != '0' && *field != '1') ) {
                rc = pstring_assign(&file->barcode, field, flen);
            }
        } else if( ntabs == 8 ) {
            if( flen == 1 && isdigit(*field) ) {
                file->read.read_id = *field - '0';
                if( file->read.read_id == 0 ) {
                    file->read.read_id = ILLUMINAWRITER_READID_NONE;
                }
            } else {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
            }
        } else if( ntabs == 9 ) {
            rc = pstring_assign(&file->read.seq, field, flen);
        } else if( ntabs == 10 ) {
            file->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
            rc = pstring_assign(&file->read.qual, field, flen);
        }
        /* fields terminated by an 11th or later tab are not used */

        field = tab + 1;
        if( rc != 0 || field >= stop ) {
            break;
        }
    }
    if( rc != 0 ) {
        return rc;
    }

    /* what is left after the last tab is either the quality or the filter */
    switch( ntabs ) {
        case 9:
            file->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
            rc = pstring_assign(&file->read.qual, field, stop - field);
            break;
        case 10:
            if( stop - field != 1 ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
            } else if( *field == '1' ) {
                file->read.filter = SRA_READ_FILTER_PASS;
            } else if( *field == '0' ) {
                file->read.filter = SRA_READ_FILTER_REJECT;
            } else {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
            }
            break;
        default:
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
            break;
    }
    if( rc == 0 ) {
        if( file->read.seq.len != file->read.qual.len ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
        } else {
            rc = pstring_quality_convert(&file->read.qual, eExperimentQualityEncoding_Ascii, 64, 0, 0x7F);
        }
    }
    return rc;
}
/* Read and validate the SFF common header, then pass flow chars and key
 * sequence to the writer.
 *
 * If a header was already loaded (next file in a concatenated stream), the
 * previous header, flow chars and key sequence are remembered and the new
 * ones must match them. When the magic number does not match and a previous
 * header exists, one retry is made after skipping the padding that would
 * align the previous file's index section to 8 bytes.
 *
 * Returns 0 or the rc of the first failure (logged).
 */
static rc_t SFFLoaderFmtReadCommonHeader(SFFLoaderFmt* self, const SRALoaderFile *file)
{
    rc_t rc = 0;
    bool skiped_idx_pad = false;
    uint16_t head_sz;
    SFFCommonHeader prev_head;
    pstring prev_flow_chars;
    pstring prev_key_seq;

    if( (rc = SRALoaderFile_Offset(file, &self->index_correction)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "Reading initial file position", NULL);
        return rc;
    }
SkipIndexPad:
    self->index_correction += self->file_advance;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, SFFCommonHeader_size, NULL, true)) != 0) {
        SRALoaderFile_LOG(file, klogErr, rc, "common header, needed $(needed) bytes",
                          PLOG_U32(needed), SFFCommonHeader_size);
        return rc;
    }
    /* take the snapshot on the first pass only: on the retry self->header
       holds the mis-parsed header, not the previous file's */
    if( !skiped_idx_pad ) {
        if( self->header.magic_number != 0 ) {
            /* next file in stream, remember prev to sync to each */
            memcpy(&prev_head, &self->header, sizeof(SFFCommonHeader));
            pstring_copy(&prev_flow_chars, &self->flow_chars);
            pstring_copy(&prev_key_seq, &self->key_seq);
        } else {
            prev_head.magic_number = 0;
            prev_head.index_length = 0;
        }
    }
    memcpy(&self->header, self->file_buf, SFFCommonHeader_size);
#if __BYTE_ORDER == __LITTLE_ENDIAN
    self->header.magic_number = bswap_32(self->header.magic_number);
    self->header.version = bswap_32(self->header.version);
    self->header.index_offset = bswap_64(self->header.index_offset);
    self->header.index_length = bswap_32(self->header.index_length);
    self->header.number_of_reads = bswap_32(self->header.number_of_reads);
    self->header.header_length = bswap_16(self->header.header_length);
    self->header.key_length = bswap_16(self->header.key_length);
    self->header.num_flows_per_read = bswap_16(self->header.num_flows_per_read);
#endif

    if( self->header.magic_number != (('.'<<24)|('s'<<16)|('f'<<8)|('f'<<0)) ) {
        if( !skiped_idx_pad && prev_head.magic_number != 0 ) {
            /* possible concatenation of 2 files with index at EOF and padded to 8 bytes with
               header values not padded, try skipping padding and reread */
            uint32_t rem = prev_head.index_length % 8;
            /* an already aligned index has no padding: pad must be 0 then, not 8 */
            uint32_t pad = rem ? (8 - rem) : 0;
            if( pad != 0 ) {
                self->file_advance += pad;
                DEBUG_MSG(5, ("%s: trying to skip over %u bytes index section padding\n", self->file_name, pad));
                skiped_idx_pad = true;
                goto SkipIndexPad;
            }
        }
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
        SRALoaderFile_LOG(file, klogErr, rc, "magic number: $(m)", PLOG_U32(m), self->header.magic_number);
        return rc;
    }
    if( self->header.version != 1 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcBadVersion);
        SRALoaderFile_LOG(file, klogErr, rc, "format version $(v)", PLOG_U32(v), self->header.version);
        return rc;
    }
    if( self->header.flowgram_format_code != SFFFormatCodeUI16Hundreths ) {
        /* NOTE: add a case here if flowgram coding gets new version to support different */
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
        SRALoaderFile_LOG(file, klogErr, rc, "common header flowgram format code", NULL);
        return rc;
    }
    if( self->header.index_length % 8 != 0 ) {
        DEBUG_MSG(5, ("%s: index_length field value is not 8 byte padded: %u\n",
                      self->file_name, self->header.index_length));
    }

    /* expected size: fixed part + flow chars + key, rounded up to a multiple of 8 */
    head_sz = SFFCommonHeader_size + self->header.num_flows_per_read + self->header.key_length;
    head_sz += (head_sz % 8) ? (8 - (head_sz % 8)) : 0;
    if( head_sz != self->header.header_length ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcFormat, rcInvalid);
        SRALoaderFile_LOG(file, klogErr, rc, "header length $(h) <> $(s) ", PLOG_2(PLOG_U16(h),PLOG_U16(s)),
                          self->header.header_length, head_sz);
        return rc;
    }

    /* read flow chars and key */
    self->file_advance = SFFCommonHeader_size;
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, head_sz - SFFCommonHeader_size, "common header", false)) != 0) {
        return rc;
    }
    self->file_advance = head_sz - SFFCommonHeader_size;

    if( (rc = pstring_assign(&self->flow_chars, self->file_buf, self->header.num_flows_per_read)) != 0 ||
        (rc = pstring_assign(&self->key_seq, self->file_buf + self->header.num_flows_per_read, self->header.key_length)) != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "reading flows/key sequence", NULL);
        return rc;
    }
    if( prev_head.magic_number != 0 ) {
        /* next file's common header must match previous file's common header, partially */
        if( prev_head.key_length != self->header.key_length ||
            prev_head.num_flows_per_read != self->header.num_flows_per_read ||
            pstring_cmp(&prev_flow_chars, &self->flow_chars) != 0 ||
            pstring_cmp(&prev_key_seq, &self->key_seq) != 0 ) {
            rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInconsistent);
            SRALoaderFile_LOG(file, klogErr, rc, "previous file common header differ in flows/key sequence", NULL);
        }
    }
    if( rc == 0 ) {
        if( self->w454 ) {
            rc = SRAWriter454_WriteHead(self->w454, &self->flow_chars, &self->key_seq);
        } else {
            rc = SRAWriterIonTorrent_WriteHead(self->wIonTorrent, &self->flow_chars, &self->key_seq);
        }
    }
    return rc;
}
/* Read the data section of one read: signal, positions, bases and qualities.
 *
 * The section holds num_flows_per_read 16-bit signal values followed by three
 * arrays of number_of_bases bytes (position deltas, bases, qualities), padded
 * to a multiple of 8. Signal values are byte-swapped on little-endian hosts
 * and position deltas are accumulated into running positions. Signal and
 * positions are skipped when self->skip_signal is set.
 *
 * Returns 0 or the rc of the first failure (logged).
 */
static rc_t SFFLoaderFmtReadData(SFFLoaderFmt* self, const SRALoaderFile* file)
{
    rc_t rc = 0;
    uint32_t i;

    /* calc signal chunk size */
    size_t signal_sz = self->header.num_flows_per_read * sizeof(uint16_t);
    /* plus position, read, quality */
    size_t sz = signal_sz + self->read_header.number_of_bases * 3;
    /* + padding */
    sz += (sz % 8) ? (8 - (sz % 8)) : 0;

    /* adjust the buffer window to full data block size */
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, sz, "read data", false)) != 0 ) {
        return rc;
    }
    self->file_advance = sz;

    if( !self->skip_signal ) {
        rc = pstring_assign(&self->signal, self->file_buf, signal_sz);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        if( rc == 0 ) {
            uint16_t* sig = (uint16_t*)self->signal.data;
            for(i = 0; i < self->header.num_flows_per_read; i++) {
                sig[i] = bswap_16(sig[i]);
            }
        }
#endif
    }

    if( rc == 0 ) {
        const uint8_t* pos = self->file_buf + signal_sz;

        if( !self->skip_signal ) {
            INSDC_coord_one *p;
            /* reset buffer to proper size */
            pstring_clear(&self->position);
            rc = pstring_append_chr(&self->position, 0, self->read_header.number_of_bases * sizeof(*p));
            /* fill positions only if the buffer was sized successfully and
               there is at least one base to take the first value from */
            if( rc == 0 && self->read_header.number_of_bases > 0 ) {
                p = (INSDC_coord_one*)&self->position.data[0];
                p[0] = pos[0];
                for(i = 1; i < self->read_header.number_of_bases; i++) {
                    p[i] = p[i - 1] + pos[i];
                }
            }
        }
        if( rc == 0 ) {
            pos += self->read_header.number_of_bases;
            rc = pstring_assign(&self->read, pos, self->read_header.number_of_bases);
        }
        if( rc == 0 ) {
            pos += self->read_header.number_of_bases;
            rc = pstring_assign(&self->quality, pos, self->read_header.number_of_bases);
        }
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "copying read data", NULL);
    }
    return rc;
}
/*
 * Handles one SRF read chunk.
 *
 * SRF_ParseReadChunk yields the read id and flags; the remaining bytes are
 * appended to the ZTR buffer and consumed block by block. BASE, CNF1 and
 * (unless fe->skip_signal) SAMP blocks are copied into a local array of
 * AbsolidRead records, which is then passed to SRAWriteAbsolid_Write.
 *
 * fe_new_read() supplies read_type and label from the read id:
 *  - read_type >  eAbisolidReadType_SPOT: each block holds data for a single
 *    read, stored at index AbisolidReadType2ReadNumber[read_type];
 *  - otherwise: each block holds the whole spot and is sliced using
 *    fe->region.start[] (element offsets; scaled by sizeof(float) for SAMP).
 *
 * Processing stops at the first failing block so that the error code cannot
 * be reset by a later, successfully parsed block.
 *
 * Returns 0 on success, otherwise the rc of the first failure (already logged).
 */
static rc_t parse_v1_read(SRF_context *ctx, ZTR_Context *ztr_ctx, const uint8_t *data, size_t size)
{
    rc_t rc = 0;
    size_t i, parsed;
    ztr_raw_t ztr_raw;
    ztr_t ztr;
    enum ztr_chunk_type type;
    fe_context_t* fe = (fe_context_t*)ctx;
    uint8_t flags;
    pstring readId;
    EAbisolidReadType read_type;
    pstring label;
    AbsolidRead read[ABSOLID_FMT_MAX_NUM_READS];

    if( fe->region.nreads == 0 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcNotFound);
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "missing region chunk before 1st read chunk", NULL);
    }
    if( (rc = SRF_ParseReadChunk(data, size, &parsed, &flags, &readId)) != 0 ) {
        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rc);
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "SRF parsing failure", NULL);
    }
    ABI_ZTR_AddToBuffer(ztr_ctx, data + parsed, size - parsed);

    /* readId will have spotname */
    if( (rc = fe_new_read(fe, &readId, &read_type, &label)) != 0 ) {
        return SRALoaderFile_LOG(ctx->file, klogErr, rc, "parsing spot name suffix", NULL);
    }

    for(i = 0; i < sizeof(read) / sizeof(read[0]); i++) {
        AbsolidRead_Init(&read[i]);
    }

    while(!ABI_ZTR_BufferIsEmpty(ztr_ctx)) {
        if( (rc = ABI_ZTR_ParseBlock(ztr_ctx, &ztr_raw)) != 0 ||
            (rc = ABI_ZTR_ProcessBlock(ztr_ctx, &ztr_raw, &ztr, &type)) != 0 ) {
            SRALoaderFile_LOG(ctx->file, klogErr, rc, "ZTR parsing failure", NULL);
            break;
        }

        switch (type) {
            case BASE:
                if(ztr.sequence->datatype != i8) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "read: expected 8-bit datatype", NULL);
                } else if( read_type > eAbisolidReadType_SPOT ) {
                    /* block carries a single read */
                    int read_number = AbisolidReadType2ReadNumber[read_type];
                    if( (rc = pstring_assign(&read[read_number].seq, ztr.sequence->data, ztr.sequence->datasize)) == 0 ) {
                        /* grab 1st, may be the only cs_key */
                        read[read_number].cs_key = fe->region.cs_key[0];
                        for(i = 1; i < fe->region.nreads; i++) {
                            if( read_type == fe->region.type[i] ) {
                                read[read_number].cs_key = fe->region.cs_key[i];
                                break;
                            }
                        }
                        SRF_set_read_filter(&read[read_number].filter, flags);
                        rc = pstring_copy(&read[read_number].label, &label);
                        DEBUG_MSG(3, ("SRF READ: '%s'\n", read[read_number].seq.data));
                    }
                    if( rc != 0 ) {
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying read", NULL);
                    }
                } else {
                    /* block carries the whole spot: slice it per region */
                    for(i = 0; rc == 0 && i < fe->region.nreads; i++) {
                        int read_number = AbisolidReadType2ReadNumber[fe->region.type[i]];
                        size_t len = (i + 1 >= fe->region.nreads ? ztr.sequence->datasize : fe->region.start[i + 1]) - fe->region.start[i];
                        rc = pstring_assign(&read[read_number].seq, &ztr.sequence->data[fe->region.start[i]], len);
                        read[read_number].cs_key = fe->region.cs_key[i];
                        SRF_set_read_filter(&read[read_number].filter, flags);
                        /* do not let a successful label copy hide a failed sequence copy */
                        if( rc == 0 && fe->region.label[i].len != 0 ) {
                            rc = pstring_copy(&read[read_number].label, &fe->region.label[i]);
                        }
                        DEBUG_MSG(3, ("SRF READ[%u]: '%s'\n", i, read[read_number].seq.data));
                    }
                    if( rc != 0 ) {
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying reads", NULL);
                    }
                }
                break;

            case CNF1:
                if(ztr.quality1->datatype != i8) {
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "quality: expected 8-bit datatype", NULL);
                } else if( read_type > eAbisolidReadType_SPOT ) {
                    int read_number = AbisolidReadType2ReadNumber[read_type];
                    if( (rc = pstring_assign(&read[read_number].qual, ztr.quality1->data, ztr.quality1->datasize)) == 0 ) {
                        DEBUG_MSG(3, ("SRF QUAL: %u bytes\n", read[read_number].qual.len));
                    }
                    if( rc != 0 ) {
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying quality", NULL);
                    }
                } else {
                    for(i = 0; rc == 0 && i < fe->region.nreads; i++) {
                        int read_number = AbisolidReadType2ReadNumber[fe->region.type[i]];
                        size_t len = (i + 1 >= fe->region.nreads ? ztr.quality1->datasize : fe->region.start[i + 1]) - fe->region.start[i];
                        rc = pstring_assign(&read[read_number].qual, &ztr.quality1->data[fe->region.start[i]], len);
                        DEBUG_MSG(3, ("SRF QUAL[%u]: %u bytes\n", i, read[read_number].qual.len));
                    }
                    if( rc != 0 ) {
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying qualities", NULL);
                    }
                }
                break;

            case SAMP:
                if( !fe->skip_signal ) {
                    int stype = ABSOLID_FMT_COLMASK_NOTSET;

                    /* validate the block and map the channel name to a column */
                    if(ztr.signal->datatype != f32) {
                        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "signal: expected 32-bit float datatype", NULL);
                    } else if( (ztr.signal->datasize % sizeof(float)) != 0 ) {
                        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcInvalid);
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "signal: size not 32-bit float aligned", NULL);
                    } else if (ztr.signal->channel == NULL) {
                        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcIncomplete);
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "SIGNAL column: missing channel type", NULL);
                    } else if(strcmp(ztr.signal->channel, "0FAM") == 0) {
                        stype = ABSOLID_FMT_COLMASK_FAM;
                    } else if(strcmp(ztr.signal->channel, "1CY3") == 0) {
                        stype = ABSOLID_FMT_COLMASK_CY3;
                    } else if(strcmp(ztr.signal->channel, "2TXR") == 0) {
                        stype = ABSOLID_FMT_COLMASK_TXR;
                    } else if(strcmp(ztr.signal->channel, "3CY5") == 0) {
                        stype = ABSOLID_FMT_COLMASK_CY5;
                    } else {
                        rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnexpected);
                        SRALoaderFile_LOG(ctx->file, klogErr, rc, "SIGNAL column: unexpected channel type", NULL);
                    }
#if __BYTE_ORDER == __LITTLE_ENDIAN
                    /* swap each 32-bit value in place on little-endian hosts */
                    for(i = 0; rc == 0 && i < ztr.signal->datasize; i += 4) {
                        uint32_t* r = (uint32_t*)&ztr.signal->data[i];
                        *r = bswap_32(*r);
                    }
#endif
                    if( rc == 0 ) {
                        if( read_type > eAbisolidReadType_SPOT ) {
                            int read_number = AbisolidReadType2ReadNumber[read_type];
                            pstring* d = NULL;

                            switch(stype) {
                                case ABSOLID_FMT_COLMASK_FAM:
                                    read[read_number].fs_type = eAbisolidFSignalType_FAM;
                                    d = &read[read_number].fxx;
                                    break;
                                case ABSOLID_FMT_COLMASK_CY3:
                                    d = &read[read_number].cy3;
                                    break;
                                case ABSOLID_FMT_COLMASK_TXR:
                                    d = &read[read_number].txr;
                                    break;
                                case ABSOLID_FMT_COLMASK_CY5:
                                    d = &read[read_number].cy5;
                                    break;
                            }
                            if( d ) {
                                rc = pstring_assign(d, ztr.signal->data, ztr.signal->datasize);
                                DEBUG_MSG(3, ("SRF SIGNAL[%s]: %u bytes\n", ztr.signal->channel, d->len));
                            } else {
                                rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
                            }
                            if( rc != 0 ) {
                                SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying signal", NULL);
                            }
                        } else {
                            if( fe->region.nreads <= 0 || fe->region.nreads > ABSOLID_FMT_MAX_NUM_READS ) {
                                rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
                                SRALoaderFile_LOG(fe->ctx.file, klogErr, rc, "read count $(c)", PLOG_U8(c), fe->region.nreads);
                            }
                            for(i = 0; rc == 0 && i < fe->region.nreads; i++) {
                                pstring* d = NULL;
                                int read_number = AbisolidReadType2ReadNumber[fe->region.type[i]];
                                /* region.start[] counts values; signal data is addressed in bytes */
                                size_t len = (i + 1 >= fe->region.nreads) ? ztr.signal->datasize : (fe->region.start[i + 1] * sizeof(float));
                                len -= fe->region.start[i] * sizeof(float);

                                switch(stype) {
                                    case ABSOLID_FMT_COLMASK_FAM:
                                        read[read_number].fs_type = eAbisolidFSignalType_FAM;
                                        d = &read[read_number].fxx;
                                        break;
                                    case ABSOLID_FMT_COLMASK_CY3:
                                        d = &read[read_number].cy3;
                                        break;
                                    case ABSOLID_FMT_COLMASK_TXR:
                                        d = &read[read_number].txr;
                                        break;
                                    case ABSOLID_FMT_COLMASK_CY5:
                                        d = &read[read_number].cy5;
                                        break;
                                }
                                if( d ) {
                                    rc = pstring_assign(d, &ztr.signal->data[fe->region.start[i] * sizeof(float)], len);
                                    DEBUG_MSG(3, ("SRF SIGNAL[%s]: %u bytes\n", ztr.signal->channel, d->len));
                                } else {
                                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnrecognized);
                                }
                            }
                            if( rc != 0 ) {
                                SRALoaderFile_LOG(ctx->file, klogErr, rc, "copying signals", NULL);
                            }
                        }
                    }
                }
                break;

            default:
                break;
        }

        if(type != none && type != ignore) {
            free(*(void **)&ztr);
        }
        if( rc != 0 ) {
            /* failure was logged above; stop here, otherwise the next
               ABI_ZTR_ParseBlock call would overwrite rc and the spot
               would be written with incomplete data */
            break;
        }
    }

    if(rc == 0) {
        if( read_type <= eAbisolidReadType_SPOT ) {
            rc = SRAWriteAbsolid_Write(fe->writer, ctx->file, &readId, NULL, &read[0], &read[1]);
        } else {
            switch( AbisolidReadType2ReadNumber[read_type] ) {
                case 0:
                    rc = SRAWriteAbsolid_Write(fe->writer, ctx->file, &readId, NULL, &read[0], NULL);
                    break;
                case 1:
                    rc = SRAWriteAbsolid_Write(fe->writer, ctx->file, &readId, NULL, NULL, &read[1]);
                    break;
                default:
                    rc = RC(rcSRA, rcFormatter, rcParsing, rcData, rcUnsupported);
                    SRALoaderFile_LOG(ctx->file, klogErr, rc, "more than 2 reads", NULL);
                    break;
            }
        }
    }
    return rc;
}
/*
 * Reads data for a single spot from a file; data may be partial.
 *
 * Does nothing if file->ready is already set (previous data not yet consumed).
 * Otherwise reinitializes file and reads one record:
 *  - '@' record (fastq): name line, sequence line, a line consisting of a
 *    single '+', and a quality line that is converted with
 *    pstring_quality_convert (Ascii encoding, offset 33, range 0..0x7F);
 *  - '>' record (fasta): name line and sequence line; quality is filled with
 *    the constant 14 for every base.
 * On success file->name, file->sequence and file->quality are filled and
 * file->ready is set to true.
 *
 * Returns 0 on success and on end of file (file->ready stays false in that
 * case); otherwise logs and returns a non-zero rc.
 */
static rc_t read_next_spot(HelicosLoaderFmt* self, HelicosFileInfo* file)
{
    rc_t rc = 0;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    HelicosFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }

    if( file->line[0] == '@' ) { /*** fastq format **/
        if( (rc = pstring_assign(&file->name, &file->line[1], file->line_len - 1)) != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading name");
        }
        file->line = NULL;
        /* NOTE(review): the lines below assume file_read_line(file, false)
           never succeeds with file->line == NULL - confirm */
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line_len > sizeof(file->sequence.data)-1 ||
            (rc = pstring_assign(&file->sequence, file->line, file->line_len)) != 0 ||
            !pstring_is_fasta(&file->sequence) ) {
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line[0] != '+' || file->line_len != 1 ) {
            /* keep the read error if there was one */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line_len > sizeof(file->quality.data)-1 ||
            (rc = pstring_assign(&file->quality, file->line, file->line_len)) != 0 ||
            (rc = pstring_quality_convert(&file->quality, eExperimentQualityEncoding_Ascii, 33, 0, 0x7F)) != 0 ) {
            /* an over-long quality line leaves rc == 0: make it an error
               instead of reporting success */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
        }
        file->line = NULL;
        file->ready = true;
    } else if( file->line[0] == '>' ) { /** fasta format **/
        if( (rc = pstring_assign(&file->name, &file->line[1], file->line_len - 1)) != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading name");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line_len > sizeof(file->sequence.data)-1 ||
            (rc = pstring_assign(&file->sequence, file->line, file->line_len)) != 0 ||
            !pstring_is_fasta(&file->sequence) ) {
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence");
        }
        file->line = NULL;
        /* fasta has no quality: one constant value (14) per base */
        file->quality.len = file->sequence.len;
        memset(file->quality.data,14,file->quality.len);
        file->ready = true;
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '@'");
    }

#if _DEBUGGING
    DEBUG_MSG(3, ("READ: name:'%s', seq[%u]:'%s', qual[%u]\n", file->name.data, file->sequence.len, file->sequence.data, file->quality.len));
    /* DEBUG_MSG(3, ("READ: name:'%s', seq[%u]:'%s', qual[%u]:'%s'\n", file->name.data, file->sequence.len, file->sequence.data, file->quality.len, file->quality.data));*/
#endif
    return 0;
}