Пример #1
0
static
rc_t read_quality(const char* data, size_t data_sz, IlluminaRead* read)
{
    rc_t rc = 0;

    if( (rc = pstring_assign(&read->qual, data, data_sz)) == 0 ) {
        if( (rc = pstring_quality_convert(&read->qual, eExperimentQualityEncoding_Decimal, 0, -128, 127)) == 0 ) {
            read->qual_type = ILLUMINAWRITER_COLMASK_QUALITY_LOGODDS4;
        }
    }
    return rc;
}
Пример #2
0
/* reads from a file data for a sinlge spot, data may be partial */
static
rc_t read_next_spot(FastqLoaderFmt* self, FastqFileInfo* file)
{
    rc_t rc = 0;

    if( file->spot->ready ) {
        /* data still not used */
        return 0;
    }
    FileReadData_init(file->spot, false);
    FileReadData_init(&file->spot[1], false);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    if( find_seq_qual_by_sep(self, file, ':') || find_seq_qual_by_sep(self, file, ' ') ) {
        /* single line forms */
        file->line = NULL; /* line consumed */
        file->spot->ready = true;
    } else  if( file->line[0] == '>' || file->line[0] == '@' ) {
        /* 4 or 8 line format */
        FileReadData sd;
        uint8_t word = 0, best_word = 0;
        uint8_t score = 0, best_score = 0;
        /* find and parse spot name on defline */
        do {
            score = parse_spot_name(file->file, &sd, &file->line[1], file->line_len - 1, ++word);
            if( score > best_score ) {
                if( (rc = pstring_copy(&file->spot->name, &sd.name)) != 0 ||
                    (rc = pstring_copy(&file->spot->barcode, &sd.barcode)) != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=copying read name");
                }
                file->spot->read.read_id = sd.read.read_id;
                best_score = score;
                best_word = word; /* used below for quality defline parsing */
            }

        } while(score != 0);
        if( best_score == 0 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcId, rcNotFound);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=spot name not detected");
        }
        if( file->line[0] == '@' ) {
            if( (rc = read_spot_data_3lines(file, file->spot, best_word, best_score, file->qualType)) != 0 ) {
                return rc;
            }
            /* now we MAY have 5th line in buffer so we can check if it's 2nd read for 8 lines format */
            if( file->line_len != 0 && file->line != NULL && file->line[0] == '@' ) {
                /* try to find read id on next line */
                FileReadData_init(&file->spot[1], false);
                if( parse_spot_name(file->file, &file->spot[1], &file->line[1], file->line_len - 1, best_word) == best_score ) {
                    if( pstring_cmp(&file->spot->name, &file->spot[1].name) == 0 &&
                        pstring_cmp(&file->spot->barcode, &file->spot[1].barcode) == 0 &&
                        file->spot->read.read_id != file->spot[1].read.read_id ) {
                        /* since it is different read id with same name and barcode, fill up second read */
                        if( (rc = read_spot_data_3lines(file, &file->spot[1], best_word, best_score, file->qualType)) != 0 ) {
                            return rc;
                        }
                    }
                }
            }
        } else {
            /* 2 line seq or quality form */
            file->line = NULL; /* line consumed */
            /* read sequence/quality */
            if( (rc = read_multiline_seq_or_qual(file, '>', &file->spot->read.seq)) != 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading seq/qual data");
            }
            if( file->spot->read.seq.len == 0 ) {
                return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=empty string reading seq/qual data");
            } else if( !pstring_is_fasta(&file->spot->read.seq) ) {
                /* swap */
                if( (rc = pstring_copy(&file->spot->read.qual, &file->spot->read.seq)) == 0 ) {
                    file->spot->read.qual_type = file->qualType;
                    pstring_clear(&file->spot->read.seq);
                }
            }
            file->spot->ready = true;
        }
    } else {
            rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=file corrupt or format unknown");
    }
    if( rc == 0 ) {
        int k;
        for(k = 0; k < 2; k++) {
            FileReadData* rd = &file->spot[k];
            if( rd->ready && rd->read.qual_type != ILLUMINAWRITER_COLMASK_NOTSET ) {
                if( file->qualOffset == 0 ) {
                    /* detect and remember */
                    file->qualOffset = 33;
		    file->qualMax = 94;
                    rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                    if( GetRCState(rc) == rcOutofrange ) {
                        file->qualOffset = 64;
			file->qualMax = 61;
                        rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                    }
                } else {
		    if(file->qualOffset == 33) file->qualMax = 94;
                    rc = pstring_quality_convert(&rd->read.qual, file->qualEnc, file->qualOffset, file->qualMin, file->qualMax);
                }
                if( rc != 0 ) {
                    return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=converting quality");
                }
            }
        }
    }
    return 0;
}
Пример #3
0
/*
 * assumes tab separated file:
 * first 2 postiions concatinated with "_" into spot prefix
 * nextg 4 postiions concatinated with ":" into spot id: lane:tile:x:y
 * 7th (index) ignored
 * 8th is read id
 * 9th fasta
 * 10th quality
 * 11th (optional) read filter
 */
static
rc_t parse_qseq(IlluminaFileInfo* file, const char* data, size_t data_sz)
{
    rc_t rc = 0;
    const char* t, *str = data, *end = data + data_sz;
    int tabs = 0;
    do {
        if( (t = memchr(str, '\t', end - str)) != NULL ) {
            switch(++tabs) {
                case 1:
                    rc = pstring_assign(&file->name, str, t - str);
                    break;
                case 2:
                    if( (rc = pstring_append(&file->name, "_", 1)) == 0 ) {
                        rc = pstring_append(&file->name, str, t - str);
                    }
                    break;
                case 3:
                case 4:
                case 5:
                case 6:
                    errno = 0;
                    file->coord[tabs - 3] = strtol(str, NULL, 10);
                    if( errno != 0 ) {
                        file->coord[tabs - 3] = 0;
                    }
                    if( (rc = pstring_append(&file->name, ":", 1)) == 0 ) {
                        rc = pstring_append(&file->name, str, t - str);
                    }
                    break;
                case 7:
                    if( t - str != 1 || (*str != '0' && *str != '1') ) {
                        rc = pstring_assign(&file->barcode, str, t - str);
                    }
                    break;
                case 8:
                    if( t - str != 1 || !isdigit(*str) ) {
                        rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
                    } else {
                        file->read.read_id = *str - '0';
                        if( file->read.read_id == 0 ) {
                            file->read.read_id = ILLUMINAWRITER_READID_NONE;
                        }
                    }
                    break;
                case 9:
                    rc = pstring_assign(&file->read.seq, str, t - str);
                    break;
                case 10:
                    file->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
                    rc = pstring_assign(&file->read.qual, str, t - str);
                    break;
            }
            str = ++t;
        }
    } while( rc == 0 && t != NULL && str < end );

    if( rc == 0 ) {
        if( tabs == 9 ) {
            file->read.qual_type = ILLUMINAWRITER_COLMASK_QUALITY_PHRED;
            rc = pstring_assign(&file->read.qual, str, end - str);
        } else if( tabs == 10 ) {
            if( end - str != 1 ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
            } else if( *str == '1' ) {
                file->read.filter = SRA_READ_FILTER_PASS;
            } else if( *str == '0' ) {
                file->read.filter = SRA_READ_FILTER_REJECT;
            } else {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
            }
        } else {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInvalid);
        }
        if( rc == 0 ) {
            if( file->read.seq.len != file->read.qual.len ) {
                rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcInconsistent);
            } else {
                rc = pstring_quality_convert(&file->read.qual, eExperimentQualityEncoding_Ascii, 64, 0, 0x7F);
            }
        }
    }
    return rc;
}
Пример #4
0
/* reads from a file data for a sinlge spot, data may be partial */
static
rc_t read_next_spot(HelicosLoaderFmt* self, HelicosFileInfo* file)
{
    rc_t rc = 0;

    if( file->ready ) {
        /* data still not used */
        return 0;
    }
    HelicosFileInfo_init(file);
    if( (rc = file_read_line(file, true)) != 0 ) {
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading more data");
    } else if( file->line == NULL ) {
        return 0; /* eof */
    }
    if( file->line[0] == '@' ) { /*** fastq format **/
        if( (rc = pstring_assign(&file->name, &file->line[1], file->line_len - 1)) != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading name");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 || file->line_len > sizeof(file->sequence.data)-1 ||
            (rc = pstring_assign(&file->sequence, file->line, file->line_len)) != 0 ||
            !pstring_is_fasta(&file->sequence) ) {
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 ||
            file->line[0] != '+' || file->line_len != 1 ) {
            rc = RC(rcSRA, rcFormatter, rcReading, rcData, rcCorrupt);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality defline");
        }
        file->line = NULL;
        if( (rc = file_read_line(file, false)) != 0 || file->line_len > sizeof(file->quality.data)-1 ||
            (rc = pstring_assign(&file->quality, file->line, file->line_len)) != 0 ||
            (rc = pstring_quality_convert(&file->quality, eExperimentQualityEncoding_Ascii, 33, 0, 0x7F)) != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading quality");
        }
        file->line = NULL;
        file->ready = true;
    } else if( file->line[0] == '>' ) { /** fasta format **/
	if( (rc = pstring_assign(&file->name, &file->line[1], file->line_len - 1)) != 0 ) {
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading name");
        }
        file->line = NULL;
	if( (rc = file_read_line(file, false)) != 0 || file->line_len > sizeof(file->sequence.data)-1 ||
            (rc = pstring_assign(&file->sequence, file->line, file->line_len)) != 0 ||
            !pstring_is_fasta(&file->sequence) ) {
            rc = rc ? rc : RC(rcSRA, rcFormatter, rcReading, rcData, rcUnrecognized);
            return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=reading sequence");
        }
	file->line = NULL;
	file->quality.len = file->sequence.len;
	memset(file->quality.data,14,file->quality.len);
	file->ready = true;
    } else {
        rc = RC(rcSRA, rcFormatter, rcReading, rcFile, rcInvalid);
        return SRALoaderFile_LOG(file->file, klogErr, rc, "$(msg)", "msg=expected '@'");
    }
#if _DEBUGGING
 DEBUG_MSG(3, ("READ: name:'%s', seq[%u]:'%s', qual[%u]\n", file->name.data,
                file->sequence.len, file->sequence.data, file->quality.len)); /*
    DEBUG_MSG(3, ("READ: name:'%s', seq[%u]:'%s', qual[%u]:'%s'\n", file->name.data,
                file->sequence.len, file->sequence.data, file->quality.len, file->quality.data));*/
#endif
    return 0;
}