/* Resets a Helicos file record: empties the name, sequence and quality
 * strings and clears the ready flag. */
static void HelicosFileInfo_init(HelicosFileInfo* file)
{
    pstring_clear(&file->name);
    pstring_clear(&file->sequence);
    pstring_clear(&file->quality);
    file->ready = false;
}
/* Resets per-read parsing state.
 *
 * The name and barcode strings are always emptied and the embedded
 * IlluminaRead is re-initialized with the same name_only flag.
 * The ready flag is cleared only for a full reset (name_only == false). */
static void FileReadData_init(FileReadData* read, bool name_only)
{
    if( !name_only ) {
        read->ready = false;
    }
    pstring_clear(&read->name);
    pstring_clear(&read->barcode);
    IlluminaRead_Init(&read->read, name_only);
}
/* Puts an IlluminaRead into its initial state.
 *
 * read      [IN,OUT] - record to reset, must not be NULL
 * name_only [IN]     - when true only the read id is reset; when false the
 *                      sequence, quality, noise, intensity and signal strings
 *                      are emptied too, the quality type becomes "not set" and
 *                      the filter becomes SRA_READ_FILTER_PASS
 */
void IlluminaRead_Init(IlluminaRead* read, bool name_only)
{
    assert(read != NULL);

    read->read_id = ILLUMINAWRITER_READID_NONE;
    if( name_only ) {
        /* caller asked to keep the payload intact */
        return;
    }

    read->qual_type = ILLUMINAWRITER_COLMASK_NOTSET;
    read->filter = SRA_READ_FILTER_PASS;

    pstring_clear(&read->seq);
    pstring_clear(&read->qual);
    pstring_clear(&read->noise);
    pstring_clear(&read->intensity);
    pstring_clear(&read->signal);
}
/* Prepares a read id for a new read.
 *
 * The text after the last '_' of readId is handed to set_label_type(), which
 * fills in label and type. When the detected type is greater than
 * eAbisolidReadType_SPOT the label and its leading '_' are cut from readId.
 * Without any '_' the label is emptied and the type is eAbisolidReadType_SPOT.
 * Finally readId is rebuilt by SRAWriteAbsolid_MakeName() from
 * self->name_prefix and a copy of the (possibly shortened) readId.
 *
 * Returns 0 on success or the first non-zero rc from set_label_type(),
 * pstring_copy() or SRAWriteAbsolid_MakeName().
 */
static rc_t fe_new_read(fe_context_t *self, pstring *readId, EAbisolidReadType* type, pstring* label)
{
    rc_t rc = 0;
    pstring name_suffix;
    const char *last_underscore;

    assert(self && readId && type && label);
    DEBUG_MSG(3, ("READ_LABEL: '%s'\n", readId->data));

    /* spot name suffix may end with '_(F|R).+' */
    last_underscore = strrchr(readId->data, '_');
    if( last_underscore == NULL ) {
        pstring_clear(label);
        *type = eAbisolidReadType_SPOT;
    } else {
        rc = set_label_type(last_underscore + 1, label, type);
        if( rc != 0 ) {
            return rc;
        }
        if( *type > eAbisolidReadType_SPOT ) {
            /* cut label */
            readId->len -= label->len + 1;
        }
    }

    rc = pstring_copy(&name_suffix, readId);
    if( rc != 0 ) {
        return rc;
    }
    return SRAWriteAbsolid_MakeName(&self->name_prefix, &name_suffix, readId);
}
/* Converts a textual quality string into binary quality values, in place.
 *
 * qstr    [IN,OUT] - quality text; overwritten with one byte per score only
 *                    after every score has been converted successfully
 * enc     [IN]     - encoding of the text; for
 *                    eExperimentQualityEncoding_Undefined the encoding is
 *                    guessed: text containing a space or a tab is treated as
 *                    decimal numbers, anything else as ASCII
 * offset  [IN]     - value subtracted from each character in ASCII encoding
 * min,max [IN]     - inclusive range of acceptable scores
 *
 * Returns 0 on success;
 *  rcParam/rcInvalid      - qstr is NULL or min > max,
 *  rcData/rcOutofrange    - a score is outside [min, max] or strtol set errno,
 *  rcData/rcUnrecognized  - enc is not one of the handled encodings,
 *  or whatever pstring_append_chr() / pstring_copy() return.
 */
rc_t pstring_quality_convert(pstring* qstr, ExperimentQualityEncoding enc, const uint8_t offset, const int8_t min, const int8_t max)
{
    rc_t rc = 0;
    char *c, *end, *next;
    pstring qbin;

    /* must return here: qstr is dereferenced right below */
    if( qstr == NULL || min > max ) {
        return RC(rcSRA, rcFormatter, rcReading, rcParam, rcInvalid);
    }

    errno = 0;
    c = qstr->data;
    end = qstr->data + qstr->len;
    pstring_clear(&qbin);

    if( enc == eExperimentQualityEncoding_Undefined ) {
        /* numbers are separated by blanks, ASCII form has none */
        if( memchr(c, ' ', qstr->len) != NULL || memchr(c, '\t', qstr->len) != NULL ) {
            enc = eExperimentQualityEncoding_Decimal;
        } else {
            enc = eExperimentQualityEncoding_Ascii;
        }
    }

    while( rc == 0 && c < end ) {
        long q;

        switch( enc ) {
            case eExperimentQualityEncoding_Decimal:
            case eExperimentQualityEncoding_Hexadecimal:
                /* spaced numbers form */
                errno = 0;
                q = strtol(c, &next, enc == eExperimentQualityEncoding_Decimal ? 10 : 16);
                if( q == 0 && c == next ) {
                    /* no more digits in line: leave the while loop too */
                    goto DONE;
                }
                c = next;
                break;

            case eExperimentQualityEncoding_Ascii:
                /* textual form with offset */
                q = (long)(*c++) - offset;
                break;

            default:
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
                break;
        }
        if( rc == 0 ) {
            if( errno != 0 || q < min || q > max ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcOutofrange);
            } else {
                rc = pstring_append_chr(&qbin, (int8_t)q, 1);
            }
        }
    }

DONE:
    if( rc == 0 ) {
        /* replace the text with the binary values only when all converted */
        rc = pstring_copy(qstr, &qbin);
    }
    return rc;
}
/* Resets an Illumina file record: the embedded read is fully re-initialized,
 * the coordinates are zeroed, the spot name is emptied and the ready flag is
 * cleared. file must not be NULL. */
static void IlluminaFileInfo_init(IlluminaFileInfo* file)
{
    assert(file);

    IlluminaRead_Init(&file->read, false);
    memset(file->coord, 0, sizeof(file->coord));
    pstring_clear(&file->name);
    file->ready = false;
}
/* Builds a spot from the current read (self->read) and writes it out.
 *
 * self   [IN]     - parsing context: name prefix, current read, writer, file
 * flags  [IN]     - passed to SRF_set_read_filter() to set the read filter
 * readId [IN,OUT] - read id; when it contains '#', its length is cut at the
 *                   '#' and the text after it becomes the spot group
 *
 * The spot name is self->name_prefix followed by readId; a ':' is inserted
 * between them when the prefix ends with a digit. Without a prefix the name
 * is readId itself.
 *
 * Returns 0 on success or the first failing rc (failures to extract the spot
 * group or to build the name are logged here).
 */
static rc_t fe_new_read(fe_context_t *self, int flags, pstring *readId )
{
    rc_t rc;
    char *suffix;
    pstring readName, spotGroup;
    static IlluminaSpot spot;

    /* look for spot group */
    suffix = strchr(readId->data, '#');
    if( suffix != NULL ) {
        readId->len = suffix++ - readId->data;
        if( (rc = pstring_assign(&spotGroup, suffix, strlen(suffix))) != 0 ) {
            SRALoaderFile_LOG(self->ctx.file, klogInt, rc, "extracting barcode from spot '$(spotname)'", "spotname=%s", readId->data);
            return rc;
        }
    } else {
        pstring_clear(&spotGroup);
    }

    /* build the read name from prefix (self->name_prefix) and read id */
    if( self->name_prefix.len > 0 ) {
        if( (rc = pstring_copy(&readName, &self->name_prefix)) == 0 ) {
            /* <ctype.h> functions need a value representable as unsigned char */
            if( isdigit((unsigned char)readName.data[readName.len - 1]) ) {
                rc = pstring_append(&readName, ":", 1);
            }
            if( rc == 0 ) {
                rc = pstring_concat(&readName, readId);
            }
        }
    } else {
        rc = pstring_copy(&readName, readId);
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(self->ctx.file, klogErr, rc, "preparing spot name $(spotname)", "spotname=%s", readId->data);
        return rc;
    }

    SRF_set_read_filter(&self->read.filter, flags);

    IlluminaSpot_Init(&spot);
    if( (rc = IlluminaSpot_Add(&spot, &readName, &spotGroup, &self->read)) == 0 ) {
        rc = SRAWriterIllumina_Write(self->writer, self->ctx.file, &spot);
    }
    return rc;
}
/* Resets an AbsolidRead to its empty state; a NULL pointer is ignored.
 * All strings (label, sequence, quality and the four signal channels) are
 * emptied, cs_key and filter become 0 and the signal type becomes
 * eAbisolidFSignalType_NotSet. */
void AbsolidRead_Init(AbsolidRead* read)
{
    if( read == NULL ) {
        return;
    }

    read->cs_key = 0;
    read->filter = 0;
    read->fs_type = eAbisolidFSignalType_NotSet;

    pstring_clear(&read->label);
    pstring_clear(&read->seq);
    pstring_clear(&read->qual);

    /* signal channels */
    pstring_clear(&read->fxx);
    pstring_clear(&read->cy3);
    pstring_clear(&read->txr);
    pstring_clear(&read->cy5);
}
/* Prepares an AbsolidSpot for collecting nreads reads.
 *
 * spot      [OUT] - spot to reset, must not be NULL
 * spot_name [IN]  - stored by pointer (not copied), must not be NULL
 * nreads    [IN]  - number of reads, must not be 0
 * read_seg  [IN]  - nreads segments copied into spot->read_seg
 *
 * Only the first nreads entries of the per-read arrays are touched.
 */
static void AbsolidSpot_Init(AbsolidSpot* spot, const pstring* spot_name, const uint16_t nreads, const SRASegment* read_seg)
{
    assert(spot != NULL);
    assert(spot_name != NULL);
    assert(nreads != 0);

    spot->spot_name = spot_name;
    spot->fs_type = eAbisolidFSignalType_NotSet;

    /* per-read arrays */
    memmove(spot->read_seg, read_seg, nreads * sizeof(spot->read_seg[0]));
    memset(spot->label_start, 0, nreads * sizeof(spot->label_start[0]));
    memset(spot->label_len, 0, nreads * sizeof(spot->label_len[0]));
    memset(spot->cs_key, 0, nreads * sizeof(spot->cs_key[0]));
    memset(spot->filter, 0, nreads * sizeof(spot->filter[0]));

    /* spot-wide strings */
    pstring_clear(&spot->label);
    pstring_clear(&spot->seq);
    pstring_clear(&spot->qual);
    pstring_clear(&spot->fxx);
    pstring_clear(&spot->cy3);
    pstring_clear(&spot->txr);
    pstring_clear(&spot->cy5);
}
/* Reads from a file the data for a single spot; the data may be partial.
 *
 * Nothing is read while the previous spot (file->spot[0]) is still marked
 * ready, i.e. has not been used yet. Otherwise both read slots are reset, one
 * line is fetched and the record layout is recognized:
 *  - single line forms, detected by find_seq_qual_by_sep() with ':' or ' ';
 *  - '@' defline: the record body is loaded by read_spot_data_3lines(); if the
 *    following line is another '@' defline with the same name and barcode but
 *    a different read id, it is loaded into file->spot[1];
 *  - '>' defline: multi-line sequence or quality text; text that is not
 *    recognized by pstring_is_fasta() is moved to the quality field.
 * Finally the quality of every ready read with a quality type set is
 * converted to binary by pstring_quality_convert().
 *
 * Returns 0 on success and when no line is available (eof); otherwise the rc
 * of the failing step (most failures are logged through SRALoaderFile_LOG and
 * its return value is passed on).
 */
static rc_t read_next_spot(FastqLoaderFmt* self, FastqFileInfo* file)
{
    rc_t rc = 0;

    if( file->spot->ready ) {
        /* data still not used */
        return 0;
    }
    FileReadData_init(file->spot, false);
    FileReadData_init(&file->spot[1], false);

    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }

    if( find_seq_qual_by_sep(self, file, ':') || find_seq_qual_by_sep(self, file, ' ') ) {
        /* single line forms */
        file->line = NULL; /* line consumed */
        file->spot->ready = true;
    } else if( file->line[0] == '>' || file->line[0] == '@' ) {
        /* 4 or 8 line format */
        FileReadData sd;
        uint8_t word = 0, best_word = 0;
        uint8_t score = 0, best_score = 0;

        /* find and parse spot name on defline: try each word in turn and
           keep the one with the highest score */
        do {
            score = parse_spot_name(file->file, &sd, &file->line[1], file->line_len - 1, ++word);
            if( score > best_score ) {
                if( (rc = pstring_copy(&file->spot->name, &sd.name)) != 0 ||
                    (rc = pstring_copy(&file->spot->barcode, &sd.barcode)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying read name");
                }
                file->spot->read.read_id = sd.read.read_id;
                best_score = score;
                best_word = word; /* used below for quality defline parsing */
            }
        } while( score != 0 );

        if( best_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcId, rcNotFound);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not detected");
        }

        if( file->line[0] == '@' ) {
            if( (rc = read_spot_data_3lines(file, file->spot, best_word, best_score, file->qualType)) != 0 ) {
                return rc;
            }
            /* now we MAY have 5th line in buffer so we can check if it's 2nd read for 8 lines format */
            if( file->line_len != 0 && file->line != NULL && file->line[0] == '@' ) {
                /* try to find read id on next line */
                FileReadData_init(&file->spot[1], false);
                if( parse_spot_name(file->file, &file->spot[1], &file->line[1], file->line_len - 1, best_word) == best_score ) {
                    if( pstring_cmp(&file->spot->name, &file->spot[1].name) == 0 &&
                        pstring_cmp(&file->spot->barcode, &file->spot[1].barcode) == 0 &&
                        file->spot->read.read_id != file->spot[1].read.read_id ) {
                        /* since it is different read id with same name and barcode, fill up second read */
                        if( (rc = read_spot_data_3lines(file, &file->spot[1], best_word, best_score, file->qualType)) != 0 ) {
                            return rc;
                        }
                    }
                }
            }
        } else {
            /* 2 line seq or quality form */
            file->line = NULL; /* line consumed */
            /* read sequence/quality */
            if( (rc = read_multiline_seq_or_qual(file, '>', &file->spot->read.seq)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading seq/qual data");
            }
            if( file->spot->read.seq.len == 0 ) {
                /* NOTE(review): rc is 0 here, so the value returned depends on
                   what SRALoaderFile_LOG returns for rc == 0 - confirm intent */
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=empty string reading seq/qual data");
            } else if( !pstring_is_fasta(&file->spot->read.seq) ) {
                /* swap: the text is quality, not bases.
                   A failed copy must not be reported as a ready spot */
                if( (rc = pstring_copy(&file->spot->read.qual, &file->spot->read.seq)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying quality data");
                }
                file->spot->read.qual_type = file->qualType;
                pstring_clear(&file->spot->read.seq);
            }
            file->spot->ready = true;
        }
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=file corrupt or format unknown");
    }

    if( rc == 0 ) {
        int k;

        for(k = 0; k < 2; k++) {
            FileReadData* rd = &file->spot[k];

            if( rd->ready && rd->read.qual_type != ILLUMINAWRITER_COLMASK_NOTSET ) {
                if( file->qualOffset == 0 ) {
                    /* detect and remember: try offset 33 first and fall back
                       to offset 64 when a value is out of range */
                    file->qualOffset = 33;
                    file->qualMax = 94;
                    rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                    if( GetRCState(rc) == rcOutofrange ) {
                        file->qualOffset = 64;
                        file->qualMax = 61;
                        rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                    }
                } else {
                    if( file->qualOffset == 33 ) {
                        file->qualMax = 94;
                    }
                    rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                }
                if( rc != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting quality");
                }
            }
        }
    }
    return 0;
}
/* Parses the spot name found in a given word (1-based, words are separated
 * by spaces) of str, which is len bytes long.
 * Looks for name(#barcode)?([\/.]\d)?
 *
 * file        [IN]  - used for error logging only
 * spot        [OUT] - receives name, barcode and read id; only its name
 *                     portion is re-initialized (FileReadData_init(spot, true))
 * str, len    [IN]  - text to parse
 * word_number [IN]  - which word to use
 *
 * Returns the score of found parts: +1 for a trailing read id ("/d", "\d",
 * or ".d" when a barcode is present or 4 coordinate separators precede it),
 * +1 for a "#barcode", +1 for the name itself, +1 for an "_R<digit>" marker
 * inside the name (which is removed from the name).
 * Score 0 means the word was not found or a string could not be stored.
 * Anything from the last '=' on is dropped from the name.
 */
static uint8_t parse_spot_name(const SRALoaderFile* file, FileReadData* spot, const char* str, size_t len, uint8_t word_number)
{
    uint8_t w, score = 0;
    const char* name, *name_end;

    name = name_end = str;
    /* set name_end to end of word_number-th word */
    for(w = 1; w <= word_number; w++ ) {
        /* skip consecutive spaces; the bound is tested first so that
           str[len] is never read */
        while( name_end != &str[len] && *name_end == ' ' ) {
            name_end++;
        }
        name = name_end;
        name_end = memchr(name, ' ', len - (name_end - str));
        if( name_end == NULL ) {
            if( w == word_number ) {
                /* last word runs to the end of the text */
                name_end = &str[len];
            }
            break;
        }
    }

    if( name != name_end && name_end != NULL ) {
        char* x;
        rc_t rc;

        /* init only name portion */
        FileReadData_init(spot, true);
        --name_end; /* goto last char */

        /* a read id suffix takes 2 chars and needs at least one name char in
           front of it, otherwise name_end would step before the word */
        if( name_end - name >= 2 ) {
            if( isdigit((unsigned char)name_end[0]) && (name_end[-1] == '\\' || name_end[-1] == '/') ) {
                score++;
                spot->read.read_id = name_end[0] - '0';
                name_end -= 2;
            } else if( isdigit((unsigned char)*name_end) && name_end[-1] == '.' ) {
                int q = 0;

                if( memrchr(name, '#', name_end - name) != NULL ) {
                    /* have barcode -> this is read id */
                    q = 4;
                } else {
                    /* may be a read id, check to see if 4 coords follow */
                    const char* end = name_end - 1;
                    while( --end >= name ) {
                        if( strchr(":|_", *end) != NULL ) {
                            q++;
                        } else if( !isdigit((unsigned char)*end) ) {
                            break;
                        }
                    }
                }
                if( q == 4 ) {
                    score++;
                    spot->read.read_id = name_end[0] - '0';
                    name_end -= 2;
                }
            }
        }

        if( (x = memrchr(name, '#', name_end - name)) != NULL ) {
            score++;
            if( (rc = pstring_assign(&spot->barcode, x + 1, name_end - x)) != 0 ) {
                /* "%.*s" takes an int precision */
                SRALoaderFile_LOG(file, klogErr, rc, "barcode $(b)", "b=%.*s", (int)(name_end - x), x + 1);
                return 0;
            }
            if( pstring_strcmp(&spot->barcode, "0") == 0 ) {
                pstring_clear(&spot->barcode);
            } else if( spot->barcode.len >= 4 &&
                       (strncmp(spot->barcode.data, "0/1_", 4) == 0 || strncmp(spot->barcode.data, "0/2_", 4) == 0) ) {
                /* read id is embedded in the barcode prefix */
                spot->read.read_id = spot->barcode.data[2] - '0';
                pstring_assign(&spot->barcode, &spot->barcode.data[4], spot->barcode.len - 4);
            }
            name_end = --x;
        }

        score++;
        if( (rc = pstring_assign(&spot->name, name, name_end - name + 1)) != 0 ) {
            SRALoaderFile_LOG(file, klogErr, rc, "spot name $(n)", "n=%.*s", (int)(name_end - name + 1), name);
            return 0;
        }

        /* search for _R\d\D in name and use it as read id, remove from name or spot won't assemble */
        x = spot->name.data;
        while( (x = strrchr(x, 'R')) != NULL ) {
            if( x != spot->name.data && *(x - 1) == '_' &&
                isdigit((unsigned char)*(x + 1)) && !isalnum((unsigned char)*(x + 2)) ) {
                score++;
                if( spot->read.read_id == -1 ) {
                    spot->read.read_id = *(x + 1) - '0';
                }
                /* drop the 3 characters "_R<digit>"; source and destination
                   overlap, so memmove (not strcpy) must be used */
                memmove(x - 1, x + 2, strlen(x + 2) + 1);
                spot->name.len -= 3;
                break;
            }
            x++;
        }

        /* find last '=' and use only whatever is to the left of it */
        if( (x = memrchr(spot->name.data, '=', spot->name.len)) != NULL ) {
            rc = pstring_assign(&spot->name, spot->name.data, (x - spot->name.data) );
        }
    }
    return score;
}
/* Reads every spot from one group of Illumina native files and writes it out.
 *
 * n [IN]     - list node, cast to FGroup*: the group of files to process
 * d [IN,OUT] - cast to FGroup_Parse_data*: carries the loader (self), the spot
 *              being assembled (spot) and receives the resulting rc
 *
 * Each pass of the outer do/while loop:
 *  1. calls read_next_spot() for every file of the group; the loop ends when
 *     no file reports ready data or when an error occurs;
 *  2. merges the ready reads into data->spot with IlluminaSpot_Add():
 *     - noise/intensity/signal files are dropped when the matching skip_*
 *       option of the loader is set,
 *     - quality of a QSeq file is dropped when the group mask has Quality4,
 *     - a Quality4 file takes its spot name from a neighbor file;
 *  3. when IlluminaSpot_Add() fails with state rcIgnored, the name mismatch is
 *     logged, the spot is skipped for all files of the group and
 *     spots_bad_count is incremented; exceeding spots_bad_allowed (when it is
 *     >= 0) turns into an rcData/rcInvalid error, otherwise rc is reset and
 *     the next spot is tried;
 *  4. otherwise the assembled spot is written by SRAWriterIllumina_Write().
 *
 * Returns true when data->rc is not 0, false otherwise.
 *
 * NOTE(review): neib is NULL if a Quality4 file has neither next nor prev
 * (a group with a single file) - confirm such groups cannot be built.
 */
bool FGroup_Parse( SLNode *n, void *d ) { FGroup_Parse_data* data = (FGroup_Parse_data*)d; FGroup* g = (FGroup*)n; bool done; const SRALoaderFile* data_block_ref = NULL; data->rc = 0; do { IlluminaFileInfo* file = g->files; done = true; while( data->rc == 0 && file != NULL ) { if( (data->rc = read_next_spot(g->blk_pfx, file)) == 0 && file->ready ) { done = false; } file = file->next; } if( data->rc != 0 || done ) { break; } /* collect spot reads, matching by spot name * spot data may be split across multiple files */ IlluminaSpot_Init(&data->spot); file = g->files; while( data->rc == 0 && file != NULL ) { if( file->ready ) { if( (file->type == eIlluminaNativeFileTypeNoise && data->self->skip_noise) || (file->type == eIlluminaNativeFileTypeIntensity && data->self->skip_intensity) || (file->type == eIlluminaNativeFileTypeSignal && data->self->skip_signal) ) { file->ready = false; } else { data_block_ref = file->file; if( file->type == eIlluminaNativeFileTypeQSeq && (g->mask & eIlluminaNativeFileTypeQuality4) ) { /* drop quality1 from qseq data */ pstring_clear(&file->read.qual); } else if( file->type == eIlluminaNativeFileTypeQuality4 ) { IlluminaFileInfo* neib = file->next ? 
file->next : file->prev; /* need to fix spotname to be same cause prb do not have any name in it */ if( (data->rc = pstring_copy(&file->name, &neib->name)) != 0 ) { SRALoaderFile_LOG(file->file, klogErr, data->rc, "$(msg) '$(n)'", "msg=syncing prb spot name,n=%s", neib->name.data); } } if( data->rc == 0 ) { data->rc = IlluminaSpot_Add(&data->spot, &file->name, &file->barcode, &file->read); if( data->rc == 0 ) { file->ready = false; } else { if( GetRCState(data->rc) == rcIgnored ) { SRALoaderFile_LOG(file->file, klogWarn, data->rc, "$(msg) '$(s1)' <> '$(s2)'", "msg=spot name mismatch,s1=%.*s,s2=%.*s", data->spot.name->len, data->spot.name->data, file->name.len, file->name.data); data->self->spots_bad_count++; /* skip spot for all files in a group */ file = g->files; while( file != NULL ) { file->ready = false; SRALoaderFile_LOG(file->file, klogWarn, data->rc, "$(msg) '$(n)'", "msg=skipped spot,n=%s", file->name.data); file = file->next; } if( data->self->spots_bad_allowed >= 0 && data->self->spots_bad_count > data->self->spots_bad_allowed ) { data->rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid); } break; } } } } } file = file->next; } if( GetRCState(data->rc) == rcIgnored ) { data->rc = 0; continue; } if( data->rc == 0 ) { data->rc = SRAWriterIllumina_Write(data->self->writer, data_block_ref, &data->spot); } } while( data->rc == 0 ); return data->rc != 0; }
/* Loads the data section of one SFF read from the file buffer.
 *
 * The section is laid out as: signal (num_flows_per_read 16-bit values),
 * then three arrays of number_of_bases bytes each (position, read, quality),
 * padded to a multiple of 8 bytes. The whole section is requested through
 * SFFLoaderFmt_ReadBlock() and self->file_advance is set to its size.
 *
 * Unless self->skip_signal is set, the signal is copied to self->signal
 * (byte-swapped on little-endian hosts) and the per-base position bytes are
 * accumulated into running sums stored in self->position as INSDC_coord_one.
 * Bases and qualities are always copied to self->read and self->quality.
 *
 * Returns 0 on success, the rc of SFFLoaderFmt_ReadBlock() if the block is not
 * available, or the rc of the failing copy (logged as "copying read data").
 */
static rc_t SFFLoaderFmtReadData(SFFLoaderFmt* self, const SRALoaderFile* file)
{
    rc_t rc = 0;
    uint32_t i;

    /* calc signal chunk size */
    size_t signal_sz = self->header.num_flows_per_read * sizeof(uint16_t);
    /* plus position, read, quality */
    size_t sz = signal_sz + self->read_header.number_of_bases * 3;
    /* + padding */
    sz += (sz % 8) ? (8 - (sz % 8)) : 0;

    /* adjust the buffer window to full data block size */
    if( (rc = SFFLoaderFmt_ReadBlock(self, file, sz, "read data", false)) != 0 ) {
        return rc;
    }
    self->file_advance = sz;

    if( !self->skip_signal ) {
        rc = pstring_assign(&self->signal, self->file_buf, signal_sz);
#if __BYTE_ORDER == __LITTLE_ENDIAN
        if( rc == 0 ) {
            uint16_t* sig = (uint16_t*)self->signal.data;
            for(i = 0; i < self->header.num_flows_per_read; i++) {
                sig[i] = bswap_16(sig[i]);
            }
        }
#endif
    }

    if( rc == 0 ) {
        const uint8_t* pos = self->file_buf + signal_sz;

        if( !self->skip_signal ) {
            INSDC_coord_one *p;

            /* reset buffer to proper size */
            pstring_clear(&self->position);
            rc = pstring_append_chr(&self->position, 0, self->read_header.number_of_bases * sizeof(*p));
            /* fill in only when the buffer really has been sized and there is
               at least one base: pos[0]/p[0] do not exist otherwise */
            if( rc == 0 && self->read_header.number_of_bases > 0 ) {
                p = (INSDC_coord_one*)&self->position.data[0];
                p[0] = pos[0];
                for(i = 1; i < self->read_header.number_of_bases; i++) {
                    /* stored bytes are increments relative to the previous base */
                    p[i] = p[i - 1] + pos[i];
                }
            }
        }
        if( rc == 0 ) {
            pos += self->read_header.number_of_bases;
            rc = pstring_assign(&self->read, pos, self->read_header.number_of_bases);
        }
        if( rc == 0 ) {
            pos += self->read_header.number_of_bases;
            rc = pstring_assign(&self->quality, pos, self->read_header.number_of_bases);
        }
    }

    if( rc != 0 ) {
        SRALoaderFile_LOG(file, klogErr, rc, "copying read data", NULL);
    }
    return rc;
}