Exemple #1
0
/*
 * Reads the three lines that follow a sequence defline: the sequence itself,
 * the '+' quality defline and the quality string, storing them into sd.
 *
 * file       - input state; file->line holds the already parsed defline on entry
 * sd         - spot record to fill (read.seq, read.qual, read.qual_type, ready)
 * best_word  - forwarded to parse_spot_name when the quality defline carries a name
 * best_score - score the sequence defline obtained; barcode and read id are only
 *              compared when the quality defline reaches the same score
 * qualType   - value stored into sd->read.qual_type
 *
 * Returns 0 on success, otherwise the rc handed to SRALoaderFile_LOG.
 */
static
rc_t read_spot_data_3lines(FastqFileInfo* file, FileReadData* sd, uint8_t best_word, uint8_t best_score, int qualType)
{
    rc_t rc;

    /* the sequence defline is of no further use */
    file->line = NULL;

    /* sequence; presumably collected up to the line starting with '+' */
    rc = read_multiline_seq_or_qual(file, '+', &sd->read.seq);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence data");
    }
    if( !pstring_is_fasta(&sd->read.seq) ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected sequence data");
    }

    /* quality defline */
    rc = file_read_line(file, false);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
    }
    if( file->line[0] != '+' ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '+' on quality defline");
    }

    /* a bare '+' carries nothing to verify; anything longer must repeat the spot name */
    if( file->line_len != 1 ) {
        FileReadData qdef;
        bool mismatch;
        uint8_t qscore = parse_spot_name(file->file, &qdef, &file->line[1], file->line_len - 1, best_word);

        /* the quality defline may omit barcode and read id, so its score may be
           below best_score, but a non-empty line must at least yield a name */
        if( qscore < 1 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not found");
        }

        mismatch = pstring_cmp(&sd->name, &qdef.name) != 0;
        if( !mismatch && qscore == best_score ) {
            /* same amount of detail as the sequence defline: everything must agree */
            mismatch = pstring_cmp(&sd->barcode, &qdef.barcode) != 0 ||
                       sd->read.read_id != qdef.read.read_id;
        }
        if( mismatch ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=quality defline do not match sequence defline");
        }
    }

    /* quality defline is done with */
    file->line = NULL;

    /* quality; presumably collected up to the next line starting with '@' */
    rc = read_multiline_seq_or_qual(file, '@', &sd->read.qual);
    if( rc != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=failed to read quality");
    }
    if( sd->read.qual.len <= 0 ) {
        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcEmpty);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=quality");
    }

    sd->read.qual_type = qualType;
    sd->ready = true;
    return 0;
}
Exemple #2
0
/*
 * Single-line form: tries to take the last two chunks delimited by sep as
 * sequence and quality. Spaces directly following a separator are skipped.
 * Everything before the second-to-last separator is handed to
 * parse_spot_name as the spot name, i.e. the line is expected to look like
 * "name sep seq sep qual".
 *
 * self - not used by this function
 * file - input state; file->line / file->line_len describe the current line,
 *        file->spot receives name, sequence and quality
 * sep  - separator character to split on
 *
 * Returns true when name, sequence and quality were all recognised and stored
 * in file->spot; false otherwise (a failure to store the data is logged).
 */
static
bool find_seq_qual_by_sep(FastqLoaderFmt* self, FastqFileInfo* file, const char sep)
{
    const char* seq = NULL, *qual = NULL;
    const char* end = NULL;
    size_t seq_len = 0, qual_len = 0;
    rc_t rc = 0;

    FileReadData_init(file->spot, false);

    /* last separator starts the quality chunk, the one before it the sequence chunk */
    qual = memrchr(file->line, sep, file->line_len);
    if( qual == NULL ) {
        return false;
    }
    seq = memrchr(file->line, sep, qual - file->line);
    if( seq == NULL ) {
        return false;
    }
    /* a zero result from parse_spot_name is treated as "no spot name" */
    if( parse_spot_name(file->file, file->spot, file->line, seq - file->line, 1) == 0 ) {
        return false;
    }
    end = file->line + file->line_len;

    /* step over the separator and any spaces after it; the bound is tested
       BEFORE the dereference and the scan never passes the quality separator,
       so seq_len cannot wrap around when separators are adjacent */
    ++seq;
    while( seq < qual && *seq == ' ' ) {
        ++seq;
    }
    seq_len = qual - seq;

    /* same for the quality chunk, bounded by the end of the line */
    ++qual;
    while( qual < end && *qual == ' ' ) {
        ++qual;
    }
    qual_len = end - qual;

    /* empty chunks are rejected first so the pointers are only dereferenced
       when they address a character inside the line */
    if( seq_len == 0 || qual_len == 0 ) {
        return false;
    }
    if( *seq == sep || *seq == ' ' || *qual == sep || *qual == ' ' ) {
        return false;
    }
    if( !match_seq_to_qual(seq, seq_len, qual, qual_len) ) {
        return false;
    }

    rc = pstring_assign(&file->spot->read.seq, seq, seq_len);
    if( rc == 0 ) {
        if( pstring_is_fasta(&file->spot->read.seq) ) {
            rc = pstring_assign(&file->spot->read.qual, qual, qual_len);
            if( rc == 0 ) {
                file->spot->read.qual_type = file->qualType;
                return true;
            }
        }
        /* not a sequence, or quality could not be stored: drop the sequence again */
        file->spot->read.seq.len = 0;
    }
    if( rc != 0 ) {
        SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=storing read data");
    }
    return false;
}
Exemple #3
0
/*
 * Reads the data for a single spot from the file; the data may be partial.
 *
 * Does nothing while file->spot->ready is still set (previous data not yet
 * consumed). Otherwise both entries of file->spot are re-initialised and the
 * next line is examined:
 *   - single-line forms split on ':' or ' ' (see find_seq_qual_by_sep);
 *   - '@' defline: sequence/'+'/quality, optionally followed by a second read
 *     of the same spot, which is stored into file->spot[1];
 *   - '>' defline: a block holding either sequence or quality.
 * Finally the quality of every ready read is converted, detecting the quality
 * offset on first use.
 *
 * self - passed through to find_seq_qual_by_sep
 * file - input state and destination spot records
 *
 * Returns 0 on success and on end of file (file->line == NULL after reading),
 * otherwise the rc handed to SRALoaderFile_LOG.
 */
static
rc_t read_next_spot(FastqLoaderFmt* self, FastqFileInfo* file)
{
    rc_t rc = 0;
    int k;

    if( file->spot->ready ) {
        /* data still not used */
        return 0;
    }
    FileReadData_init(file->spot, false);
    FileReadData_init(&file->spot[1], false);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    if( find_seq_qual_by_sep(self, file, ':') || find_seq_qual_by_sep(self, file, ' ') ) {
        /* single line forms */
        file->line = NULL; /* line consumed */
        file->spot->ready = true;
    } else if( file->line[0] == '>' || file->line[0] == '@' ) {
        /* 4 or 8 line format */
        FileReadData sd;
        uint8_t word = 0, best_word = 0;
        uint8_t score = 0, best_score = 0;

        /* find and parse spot name on defline: keep calling parse_spot_name with
           an increasing word argument until it reports 0, remembering the result
           with the highest score */
        do {
            score = parse_spot_name(file->file, &sd, &file->line[1], file->line_len - 1, ++word);
            if( score > best_score ) {
                if( (rc = pstring_copy(&file->spot->name, &sd.name)) != 0 ||
                    (rc = pstring_copy(&file->spot->barcode, &sd.barcode)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying read name");
                }
                file->spot->read.read_id = sd.read.read_id;
                best_score = score;
                best_word = word; /* used below for quality defline parsing */
            }
        } while( score != 0 );
        if( best_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcId, rcNotFound);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not detected");
        }
        if( file->line[0] == '@' ) {
            if( (rc = read_spot_data_3lines(file, file->spot, best_word, best_score, file->qualType)) != 0 ) {
                return rc;
            }
            /* now we MAY have 5th line in buffer so we can check if it's 2nd read for 8 lines format */
            if( file->line_len != 0 && file->line != NULL && file->line[0] == '@' ) {
                /* try to find read id on next line */
                FileReadData_init(&file->spot[1], false);
                if( parse_spot_name(file->file, &file->spot[1], &file->line[1], file->line_len - 1, best_word) == best_score ) {
                    if( pstring_cmp(&file->spot->name, &file->spot[1].name) == 0 &&
                        pstring_cmp(&file->spot->barcode, &file->spot[1].barcode) == 0 &&
                        file->spot->read.read_id != file->spot[1].read.read_id ) {
                        /* since it is different read id with same name and barcode, fill up second read */
                        if( (rc = read_spot_data_3lines(file, &file->spot[1], best_word, best_score, file->qualType)) != 0 ) {
                            return rc;
                        }
                    }
                }
            }
        } else {
            /* 2 line seq or quality form */
            file->line = NULL; /* line consumed */
            /* read sequence/quality */
            if( (rc = read_multiline_seq_or_qual(file, '>', &file->spot->read.seq)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading seq/qual data");
            }
            if( file->spot->read.seq.len == 0 ) {
                /* rc is still 0 here; set a real error code so the failure is
                   not reported to the caller as success */
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcEmpty);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=empty string reading seq/qual data");
            } else if( !pstring_is_fasta(&file->spot->read.seq) ) {
                /* not a sequence, so the block holds quality: move it over */
                if( (rc = pstring_copy(&file->spot->read.qual, &file->spot->read.seq)) != 0 ) {
                    /* previously dropped silently; the spot would have been
                       marked ready without any data */
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying quality data");
                }
                file->spot->read.qual_type = file->qualType;
                pstring_clear(&file->spot->read.seq);
            }
            file->spot->ready = true;
        }
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=file corrupt or format unknown");
    }

    /* every path reaching this point has rc == 0: convert quality of each ready read */
    for( k = 0; k < 2; k++ ) {
        FileReadData* rd = &file->spot[k];
        if( rd->ready && rd->read.qual_type != ILLUMINAWRITER_COLMASK_NOTSET ) {
            if( file->qualOffset == 0 ) {
                /* detect and remember: try offset 33 first and fall back to
                   offset 64 when the conversion reports an out-of-range value */
                file->qualOffset = 33;
                file->qualMax = 94;
                rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                if( GetRCState(rc) == rcOutofrange ) {
                    /* NOTE(review): assumes a failed conversion leaves the quality
                       string unchanged so it can be converted again - confirm */
                    file->qualOffset = 64;
                    file->qualMax = 61;
                    rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                }
            } else {
                if( file->qualOffset == 33 ) {
                    file->qualMax = 94;
                }
                rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
            }
            if( rc != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting quality");
            }
        }
    }
    return 0;
}
Exemple #4
0
/*
 * Reads the data for a single spot from the file; the data may be partial.
 *
 * Does nothing while file->ready is still set (previous data not yet
 * consumed). Otherwise re-initialises file, reads one line and parses it
 * according to file->type:
 *   - QSeq:     the whole line is handled by parse_qseq;
 *   - Quality4: the whole line is handled by read_quality and the name is set
 *               to blk_pfx;
 *   - Fasta / Noise / Intensity / Signal: read_spot_coord parses the leading
 *               coordinates into the name (prefixed with "blk_pfx:" when
 *               blk_pfx is not NULL) and the rest of the line is then parsed
 *               as sequence or signal values.
 *
 * blk_pfx - prefix for the spot name; may be NULL for the coordinate based types
 * file    - input state and destination of the parsed data
 *
 * Returns 0 on success and on end of file (file->line == NULL after reading),
 * otherwise the rc handed to SRALoaderFile_LOG.
 */
static
rc_t read_next_spot(const char* blk_pfx, IlluminaFileInfo* file)
{
    rc_t rc = 0;
    const char* tail = NULL;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    IlluminaFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    /* tail must refer to the line just read: file types that do not move it
       below then yield a zero offset instead of a difference between unrelated
       (or NULL) pointers */
    tail = file->line;

    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
            if( (rc = parse_qseq(file, file->line, file->line_len)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading qseq");
            }
            break;

        case eIlluminaNativeFileTypeFasta:
        case eIlluminaNativeFileTypeNoise:
        case eIlluminaNativeFileTypeIntensity:
        case eIlluminaNativeFileTypeSignal:
            /* read only common first 4 coords into name and prepend with DATA_BLOCK/@name */
            if( (rc = read_spot_coord(file, file->line, file->line_len, &tail)) == 0 ) {
                if( blk_pfx != NULL ) {
                    pstring tmp_name;
                    if( (rc = pstring_copy(&tmp_name, &file->name)) == 0 &&
                        (rc = pstring_assign(&file->name, blk_pfx, strlen(blk_pfx))) == 0 &&
                        (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                        rc = pstring_concat(&file->name, &tmp_name);
                    }
                }
            }
            if( rc != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading spot coord");
            }
            break;

        case eIlluminaNativeFileTypeQuality4:
            /* NOTE(review): blk_pfx is passed to strlen without a NULL check here,
               unlike the coordinate based types above - confirm callers always
               supply a prefix for this file type */
            if( (rc = read_quality(file->line, file->line_len, &file->read)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
            } else if( (rc = pstring_assign(&file->name, blk_pfx, strlen(blk_pfx))) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=name for quality 4");
            }
            break;

        default:
            rc = RC(rcSRA, rcFormatter, rcReading, rcFileFormat, rcUnknown);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=processing data line");
    }

    /* process tail (after coords) for some file types */
    file->line_len -= tail - file->line; /* length of tail */
    switch( file->type ) {
        case eIlluminaNativeFileTypeQSeq:
        case eIlluminaNativeFileTypeQuality4:
        default:
            /* completely processed before */
            break;

        case eIlluminaNativeFileTypeFasta:
            if( (rc = pstring_assign(&file->read.seq, tail, file->line_len)) != 0 ||
                !pstring_is_fasta(&file->read.seq) ) {
                /* keep the rc of a failed assignment; only a non-sequence is "corrupt" */
                rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading fasta");
            }
            break;

        case eIlluminaNativeFileTypeNoise:
            if( (rc = read_signal(tail, file->line_len, &file->read.noise)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting noise");
            }
            break;

        case eIlluminaNativeFileTypeIntensity:
            if( (rc = read_signal(tail, file->line_len, &file->read.intensity)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting intensity");
            }
            break;

        case eIlluminaNativeFileTypeSignal:
            if( (rc = read_signal(tail, file->line_len, &file->read.signal)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting signal");
            }
            break;
    }
    file->line = NULL; /* line consumed */
    file->ready = true;
#if _DEBUGGING
    DEBUG_MSG(3, ("name:'%s' [%li:%li:%li:%li]\n", file->name.data, 
                file->coord[0], file->coord[1], file->coord[2], file->coord[3]));
    if( file->read.seq.len ) {
        DEBUG_MSG(3, ("seq:'%.*s'\n", file->read.seq.len, file->read.seq.data));
    }
    if( file->read.qual.len ) {
        DEBUG_MSG(3, ("qual{0x%x}: %u bytes\n", file->read.qual_type, file->read.qual.len));
    }
#endif
    return 0;
}
Exemple #5
0
/*
 * Reads the data for a single spot from the file; the data may be partial.
 *
 * Does nothing while file->ready is still set (previous data not yet
 * consumed). Otherwise re-initialises file and reads one record:
 *   - '@' defline (fastq): name, sequence line, a bare '+' line and a quality
 *     line, which is converted with pstring_quality_convert;
 *   - '>' defline (fasta): name and sequence line; every quality value is
 *     filled with the constant 14.
 *
 * self - not used by this function
 * file - input state and destination (name, sequence, quality, ready)
 *
 * Returns 0 on success and on end of file (file->line == NULL after reading),
 * otherwise the rc handed to SRALoaderFile_LOG.
 */
static
rc_t read_next_spot(HelicosLoaderFmt* self, HelicosFileInfo* file)
{
    rc_t rc = 0;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    HelicosFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    if( file->line[0] == '@' ) { /*** fastq format **/
        if( (rc = pstring_assign(&file->name, &file->line[1], file->line_len - 1)) != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading name");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 || file->line_len > sizeof(file->sequence.data)-1 ||
            (rc = pstring_assign(&file->sequence, file->line, file->line_len)) != 0 ||
            !pstring_is_fasta(&file->sequence) ) {
            /* the length and fasta checks fail with rc == 0, so supply a code */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence");
        }
        file->line = NULL;
        /* quality defline must be a single '+' */
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line[0] != '+' || file->line_len != 1 ) {
            /* keep the rc of a failed read instead of masking it as corrupt data */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 || file->line_len > sizeof(file->quality.data)-1 ||
            (rc = pstring_assign(&file->quality, file->line, file->line_len)) != 0 ||
            (rc = pstring_quality_convert(&file->quality, eExperimentQualityEncoding_Ascii, 33, 0, 0x7F)) != 0 ) {
            /* an oversize quality line gets here with rc == 0; without a real
               code the failure would be logged with rc 0 */
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
        }
        file->line = NULL;
        file->ready = true;
    } else if( file->line[0] == '>' ) { /** fasta format **/
        if( (rc = pstring_assign(&file->name, &file->line[1], file->line_len - 1)) != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading name");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 || file->line_len > sizeof(file->sequence.data)-1 ||
            (rc = pstring_assign(&file->sequence, file->line, file->line_len)) != 0 ||
            !pstring_is_fasta(&file->sequence) ) {
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence");
        }
        file->line = NULL;
        /* fasta carries no quality: synthesize one value (14) per base */
        file->quality.len = file->sequence.len;
        memset(file->quality.data, 14, file->quality.len);
        file->ready = true;
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '@'");
    }
#if _DEBUGGING
    DEBUG_MSG(3, ("READ: name:'%s', seq[%u]:'%s', qual[%u]\n", file->name.data,
                file->sequence.len, file->sequence.data, file->quality.len));
#endif
    return 0;
}