Beispiel #1
0
// Read the per-anchor data of an anchor information object from a library file.
//
// pInfo->size must already hold the number of anchors, and pLength, pMd5s and
// pAnchors must have room for that many entries. The data is read in this order:
//   1. pInfo->size anchor lengths (int32_t each)
//   2. pInfo->size * MD5_STR_LEN md5 characters
//   3. for every anchor: a uint32_t name length followed by the name bytes
//      (no terminator is stored in the file)
// Every name is copied into a newly malloc'ed, null-terminated string and
// registered in pInfo->pAnchorHash with the anchor index as its value.
// Read and allocation failures are reported through SR_ErrQuit.
static void SR_AnchorInfoRead(SR_AnchorInfo* pInfo, FILE* libFile)
{
    // fread returns size_t; keep the full width for the comparisons below
    size_t readSize = 0;

    readSize = fread(pInfo->pLength, sizeof(int32_t), pInfo->size, libFile);
    if (readSize != pInfo->size)
        SR_ErrQuit("ERROR: Cannot read the length of anchors from the library file.\n");

    readSize = fread(pInfo->pMd5s, sizeof(char), pInfo->size * MD5_STR_LEN, libFile);
    if (readSize != pInfo->size * MD5_STR_LEN)
        SR_ErrQuit("ERROR: Cannot read the md5 strings from the library file.\n");

    uint32_t anchorNameLen = 0;
    for (unsigned int i = 0; i != pInfo->size; ++i)
    {
        readSize = fread(&anchorNameLen, sizeof(uint32_t), 1, libFile);
        if (readSize != 1)
            SR_ErrQuit("ERROR: Cannot read the length of anchor name from the library file.\n");

        // The length comes straight from the file. In 32-bit arithmetic
        // UINT32_MAX + 1 wraps to 0, which would make the terminator write
        // below land far outside the allocation.
        if (anchorNameLen == UINT32_MAX)
            SR_ErrQuit("ERROR: Invalid length of anchor name in the library file.\n");

        pInfo->pAnchors[i] = (char*) malloc(((size_t) anchorNameLen + 1) * sizeof(char));
        if (pInfo->pAnchors[i] == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of the anchor name.\n");

        pInfo->pAnchors[i][anchorNameLen] = '\0';

        readSize = fread(pInfo->pAnchors[i], sizeof(char), anchorNameLen, libFile);
        if (readSize != anchorNameLen)
            SR_ErrQuit("ERROR: Cannot read the anchor name from the library file.\n");

        // map the anchor name to its index
        int ret = 0;
        khiter_t khIter = kh_put(name, pInfo->pAnchorHash, pInfo->pAnchors[i], &ret);
        kh_value((khash_t(name)*) pInfo->pAnchorHash, khIter) = i;
    }
}
Beispiel #2
0
// Read the sample names of a sample information object from a library file.
//
// pInfo->size must already hold the number of samples and pInfo->pSamples must
// have room for that many pointers. For every sample the file holds a uint32_t
// name length followed by the name bytes (no terminator). Every name is copied
// into a newly malloc'ed, null-terminated string and registered in
// pInfo->pSampleHash with the sample index as its value.
// Read and allocation failures are reported through SR_ErrQuit.
static void SR_SampleInfoRead(SR_SampleInfo* pInfo, FILE* libFile)
{
    // fread returns size_t; keep the full width for the comparisons below
    size_t readSize = 0;
    uint32_t sampleNameLen = 0;

    for (unsigned int i = 0; i != pInfo->size; ++i)
    {
        readSize = fread(&sampleNameLen, sizeof(uint32_t), 1, libFile);
        if (readSize != 1)
            SR_ErrQuit("ERROR: Cannot read the length of sample name from the library file.\n");

        // The length comes straight from the file. In 32-bit arithmetic
        // UINT32_MAX + 1 wraps to 0, which would make the terminator write
        // below land far outside the allocation.
        if (sampleNameLen == UINT32_MAX)
            SR_ErrQuit("ERROR: Invalid length of sample name in the library file.\n");

        pInfo->pSamples[i] = (char*) malloc(((size_t) sampleNameLen + 1) * sizeof(char));
        if (pInfo->pSamples[i] == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of the sample name.\n");

        pInfo->pSamples[i][sampleNameLen] = '\0';

        readSize = fread(pInfo->pSamples[i], sizeof(char), sampleNameLen, libFile);
        if (readSize != sampleNameLen)
            SR_ErrQuit("ERROR: Cannot read the sample name from the library file.\n");

        // map the sample name to its index
        int ret = 0;
        khiter_t khIter = kh_put(name, pInfo->pSampleHash, pInfo->pSamples[i], &ret);
        kh_value((khash_t(name)*) pInfo->pSampleHash, khIter) = i;
    }
}
Beispiel #3
0
// Allocate an anchor information object with room for `capacity` anchors.
//
// Storage is reserved for `capacity` name pointers, `capacity` lengths and
// `capacity * MD5_STR_LEN` md5 characters; none of it is initialized. The name
// hash is created and resized to 2 * capacity. The returned object has size 0.
// A failed allocation of the object or one of its arrays is reported through
// SR_ErrQuit.
SR_AnchorInfo* SR_AnchorInfoAlloc(uint32_t capacity)
{
    SR_AnchorInfo* pInfo = (SR_AnchorInfo*) malloc(sizeof(*pInfo));
    if (!pInfo)
    {
        SR_ErrQuit("ERROR: Not enough memory for an anchor information object.\n");
    }

    // anchor names
    pInfo->pAnchors = (char**) malloc(capacity * sizeof(char*));
    if (!pInfo->pAnchors)
    {
        SR_ErrQuit("ERROR: Not enough memory for the storage of anchor names in the anchor information object.\n");
    }

    // anchor lengths
    pInfo->pLength = (int32_t*) malloc(capacity * sizeof(int32_t));
    if (!pInfo->pLength)
    {
        SR_ErrQuit("ERROR: Not enough memory for the storage of anchor length in the anchor information object.\n");
    }

    // md5 strings, MD5_STR_LEN characters per anchor
    pInfo->pMd5s = (char*) malloc(capacity * MD5_STR_LEN * sizeof(char));
    if (!pInfo->pMd5s)
    {
        SR_ErrQuit("ERROR: Not enough memory for the storage of md5 string in the anchor information object.\n");
    }

    pInfo->capacity = capacity;
    pInfo->size = 0;

    // name -> index hash
    pInfo->pAnchorHash = kh_init(name);
    kh_resize(name, pInfo->pAnchorHash, 2 * capacity);

    return pInfo;
}
Beispiel #4
0
// Serialize a library information table into libFile.
//
// The records are written in this order:
//   1. number of anchors, number of samples, number of read groups (uint32_t each)
//   2. the anchor information (SR_AnchorInfoWrite)
//   3. the sample information (SR_SampleInfoWrite)
//   4. the read-group-to-sample map (pTable->size int32_t values)
//   5. for every read group: uint32_t name length, then the name bytes
//      without a terminator
//   6. fragLenMax (uint32_t), cutoff (double), trimRate (double)
//   7. pTable->size raw SR_LibInfo structures
// The stream is flushed at the end. A short write is reported through SR_ErrQuit.
void SR_LibInfoTableWrite(const SR_LibInfoTable* pTable, FILE* libFile)
{
    // element counts
    if (fwrite(&(pTable->pAnchorInfo->size), sizeof(uint32_t), 1, libFile) != 1)
        SR_ErrQuit("ERROR: Cannot write the number of anchors into the information file.\n");

    if (fwrite(&(pTable->pSampleInfo->size), sizeof(uint32_t), 1, libFile) != 1)
        SR_ErrQuit("ERROR: Cannot write the number of samples into the information file.\n");

    if (fwrite(&(pTable->size), sizeof(uint32_t), 1, libFile) != 1)
        SR_ErrQuit("ERROR: Cannot write the number of read groups into the information file.\n");

    // anchors and samples
    SR_AnchorInfoWrite(pTable->pAnchorInfo, libFile);
    SR_SampleInfoWrite(pTable->pSampleInfo, libFile);

    // read-group-to-sample map
    if (fwrite(pTable->pSampleMap, sizeof(int32_t), pTable->size, libFile) != pTable->size)
        SR_ErrQuit("ERROR: Cannot write the read-group-to-sample map into the information file.\n");

    // read group names, each prefixed with its length
    for (unsigned int rg = 0; rg != pTable->size; ++rg)
    {
        const char* rgName = pTable->pReadGrps[rg];
        uint32_t rgNameLen = strlen(rgName);

        if (fwrite(&rgNameLen, sizeof(uint32_t), 1, libFile) != 1)
            SR_ErrQuit("ERROR: Cannot write the readGrp name length into the information file.\n");

        if (fwrite(rgName, sizeof(char), rgNameLen, libFile) != rgNameLen)
            SR_ErrQuit("ERROR: Cannot write the readGrp name into the information file.\n");
    }

    // table-wide scalars
    if (fwrite(&(pTable->fragLenMax), sizeof(uint32_t), 1, libFile) != 1)
        SR_ErrQuit("ERROR: Cannot write the maximum fragment length into the library file.\n");

    if (fwrite(&(pTable->cutoff), sizeof(double), 1, libFile) != 1)
        SR_ErrQuit("ERROR: Cannot write the cutoff into the library information file.\n");

    if (fwrite(&(pTable->trimRate), sizeof(double), 1, libFile) != 1)
        SR_ErrQuit("ERROR: Cannot write the trim rate into the library information file.\n");

    // per-read-group library information
    if (fwrite(pTable->pLibInfo, sizeof(SR_LibInfo), pTable->size, libFile) != pTable->size)
        SR_ErrQuit("ERROR: Cannot write the library information into the output file.\n");

    fflush(libFile);
}
Beispiel #5
0
// Extract the sample name ("SM:" field) of one @RG header line and register it
// in the sample information of the table.
//
// pSampleID : receives the index of the sample (existing or newly added)
// pTable    : table whose pSampleInfo is updated
// tagPos    : start of the @RG line inside the header text
// lineEnd   : position of the newline that ends this line, or NULL if the
//             caller could not find one
//
// Returns SR_OK when a non-empty sample name was found on this line, SR_ERR
// otherwise (no "SM:" tag, tag located beyond the end of the line, empty name,
// or no line end). A name that is already known only has its index reported;
// a new name is appended to pSamples, which grows when full.
static SR_Status SR_LibInfoTableAddSample(int* pSampleID, SR_LibInfoTable* pTable, const char* tagPos, const char* lineEnd)
{
    // without a line end there is no way to tell whether a tag belongs to this read group
    if (lineEnd == NULL)
        return SR_ERR;

    // the tag must exist and must lie on the current line; otherwise strpbrk
    // below would be handed NULL or a position belonging to another line
    const char* sampleNamePos = strstr(tagPos, "SM:");
    if (sampleNamePos == NULL || sampleNamePos >= lineEnd)
        return SR_ERR;

    sampleNamePos += 3;

    const char* sampleNameEnd = strpbrk(sampleNamePos, " \t\n");
    if (sampleNameEnd == NULL)
        return SR_ERR;

    int sampleNameLen = sampleNameEnd - sampleNamePos;

    SR_SampleInfo* pSampleInfo = pTable->pSampleInfo;

    if (sampleNameLen > 0)
    {
        char* buff = (char*) malloc((sampleNameLen + 1) * sizeof(char));
        if (buff == NULL)
            SR_ErrQuit("ERROR: Not enought memory for the read group ID in the fragment length distribution object.\n");

        buff[sampleNameLen] = '\0';
        memcpy(buff, sampleNamePos, sampleNameLen);

        int ret = 0;

        khash_t(name)* pSampleHash = pSampleInfo->pSampleHash;
        khiter_t khIter = kh_put(name, pSampleHash, buff, &ret);

        if (ret == 0)
        {
            // the name is already registered: drop the copy and report the existing index
            free(buff);
            *pSampleID = kh_value(pSampleHash, khIter);
        }
        else
        {
            if (pSampleInfo->size == pSampleInfo->capacity)
            {
                // doubling a capacity of zero would leave no room for the new entry
                pSampleInfo->capacity = (pSampleInfo->capacity == 0) ? 1 : pSampleInfo->capacity * 2;
                pSampleInfo->pSamples = (char**) realloc(pSampleInfo->pSamples, pSampleInfo->capacity * sizeof(char*));
                if (pSampleInfo->pSamples == NULL)
                    SR_ErrQuit("ERROR: Not enought memory for the sample names in the fragment length distribution object.\n");
            }

            // new sample: the hash key and pSamples share the same string
            *pSampleID = pSampleInfo->size;
            kh_value(pSampleHash, khIter) = pSampleInfo->size;
            pSampleInfo->pSamples[pSampleInfo->size] = buff;
            ++(pSampleInfo->size);
        }

        return SR_OK;
    }

    return SR_ERR;
}
Beispiel #6
0
// Write the sample names of a sample information object into libFile.
//
// For each of the pInfo->size samples a uint32_t name length is written,
// followed by the name bytes without a terminator. A short write is reported
// through SR_ErrQuit.
static void SR_SampleInfoWrite(const SR_SampleInfo* pInfo, FILE* libFile)
{
    for (unsigned int sm = 0; sm != pInfo->size; ++sm)
    {
        const char* smName = pInfo->pSamples[sm];
        uint32_t smNameLen = strlen(smName);

        if (fwrite(&smNameLen, sizeof(uint32_t), 1, libFile) != 1)
            SR_ErrQuit("ERROR: Cannot write the sample name length into the information file.\n");

        if (fwrite(smName, sizeof(char), smNameLen, libFile) != smNameLen)
            SR_ErrQuit("ERROR: Cannot write the sample name into the information file.\n");
    }
}
Beispiel #7
0
// Read the header of a bam file and wrap it in a SR_BamHeader object.
//
// Returns NULL when bam_header_read fails. Otherwise the returned object holds
// the original header and, in pMD5s, one pointer per reference sequence to the
// text that follows each "M5:" tag in the header text (the pointers refer into
// the header text, nothing is copied). If the number of "M5:" tags differs from
// the number of reference sequences, pMD5s is released and set to NULL; a
// warning is printed unless no tag was found at all.
SR_BamHeader* SR_BamInStreamLoadHeader(SR_BamInStream* pBamInStream)
{
    bam_header_t* pOrigHeader = bam_header_read(pBamInStream->fpBamInput);
    if (pOrigHeader == NULL)
        return NULL;

    SR_BamHeader* pBamHeader = SR_BamHeaderAlloc();

    pBamHeader->pOrigHeader = pOrigHeader;

    pBamHeader->pMD5s = (const char**) calloc(pOrigHeader->n_targets, sizeof(char*));
    if (pBamHeader->pMD5s == NULL)
        SR_ErrQuit("ERROR: Not enough memory for md5 string");

    // Count up to n_targets + 1 tags so that a surplus tag is detected below,
    // but only store the first n_targets: the array has no slot for the extra one.
    unsigned int numMD5 = 0;
    for (const char* md5Pos = pOrigHeader->text; numMD5 <= pOrigHeader->n_targets && (md5Pos = strstr(md5Pos, "M5:")) != NULL; ++numMD5, ++md5Pos)
    {
        if (numMD5 < pOrigHeader->n_targets)
            pBamHeader->pMD5s[numMD5] = md5Pos + 3;
    }

    if (numMD5 != pOrigHeader->n_targets)
    {
        free(pBamHeader->pMD5s);
        pBamHeader->pMD5s = NULL;

        if (numMD5 != 0)
            SR_ErrMsg("WARNING: Number of MD5 string is not consistent with number of chromosomes.");
    }

    return pBamHeader;
}
Beispiel #8
0
// Allocate a zero-initialized bam header object.
// A failed allocation is reported through SR_ErrQuit.
SR_BamHeader* SR_BamHeaderAlloc(void)
{
    SR_BamHeader* pHeader = (SR_BamHeader*) calloc(1, sizeof(*pHeader));
    if (!pHeader)
    {
        SR_ErrQuit("ERROR: Not enough memory for a bam header object");
    }

    return pHeader;
}
Beispiel #9
0
// Allocate a read pair attribute array for `numReadGrp` read groups.
//
// The attribute storage starts with DEFAULT_RP_ATTRB_CAPACITY uninitialized
// elements and a size of 0; pBoundaries gets one pair of doubles per read
// group, also uninitialized. A failed allocation is reported through SR_ErrQuit.
SR_ReadPairAttrbtArray* SR_ReadPairAttrbtArrayAlloc(uint32_t numReadGrp)
{
    SR_ReadPairAttrbtArray* pArray = (SR_ReadPairAttrbtArray*) malloc(sizeof(*pArray));
    if (!pArray)
    {
        SR_ErrQuit("ERROR: Not enough memory for a read pair attribute array object.\n");
    }

    // attribute storage
    pArray->data = (SR_ReadPairAttrbt*) malloc(DEFAULT_RP_ATTRB_CAPACITY * sizeof(SR_ReadPairAttrbt));
    if (!pArray->data)
    {
        SR_ErrQuit("ERROR: Not enough memory for the storage of the read pair attributes in the read pair attribute array object.\n");
    }

    // two boundary values per read group
    pArray->pBoundaries = (double (*)[2]) malloc(sizeof(double[2]) * numReadGrp);
    if (!pArray->pBoundaries)
    {
        SR_ErrQuit("ERROR: Not enough memory for the storage of the boundaries in the read pair attribute array object.\n");
    }

    pArray->capacity = DEFAULT_RP_ATTRB_CAPACITY;
    pArray->size = 0;
    pArray->numReadGrp = numReadGrp;

    return pArray;
}
Beispiel #10
0
// Allocate a sample information object with room for `capacity` samples.
//
// Storage is reserved for `capacity` name pointers and `capacity` read
// fractions; none of it is initialized. The name hash is created and resized
// to 2 * capacity. The returned object has size 0. A failed allocation of the
// object or one of its arrays is reported through SR_ErrQuit.
SR_SampleInfo* SR_SampleInfoAlloc(uint32_t capacity)
{
    SR_SampleInfo* pInfo = (SR_SampleInfo*) malloc(sizeof(*pInfo));
    if (!pInfo)
    {
        SR_ErrQuit("ERROR: Not enough memory for a sample information object.\n");
    }

    // sample names
    pInfo->pSamples = (char**) malloc(capacity * sizeof(char*));
    if (!pInfo->pSamples)
    {
        SR_ErrQuit("ERROR: Not enough memory for the storage of sample names in the sample information object.\n");
    }

    // read fraction of each sample
    pInfo->pReadFraction = (double*) malloc(capacity * sizeof(double));
    if (!pInfo->pReadFraction)
    {
        SR_ErrQuit("ERROR: Not enough memory for the storage of read fraction in the sample information object.\n");
    }

    // name -> index hash
    pInfo->pSampleHash = kh_init(name);
    kh_resize(name, pInfo->pSampleHash, 2 * capacity);

    pInfo->capacity = capacity;
    pInfo->size = 0;

    return pInfo;
}
Beispiel #11
0
// Empty a read pair attribute array and make sure it can hold `newCapacity`
// elements.
//
// The size is always reset to 0. When `newCapacity` exceeds the current
// capacity the old storage is released (its contents are not kept) and a new
// uninitialized buffer of `newCapacity` elements takes its place; otherwise
// the existing buffer is reused. A failed allocation is reported through
// SR_ErrQuit.
void SR_ReadPairAttrbtArrayReInit(SR_ReadPairAttrbtArray* pAttrbtArray, uint64_t newCapacity)
{
    pAttrbtArray->size = 0;

    // the current buffer is already large enough
    if (newCapacity <= pAttrbtArray->capacity)
        return;

    free(pAttrbtArray->data);

    pAttrbtArray->data = (SR_ReadPairAttrbt*) malloc(newCapacity * sizeof(SR_ReadPairAttrbt));
    if (!pAttrbtArray->data)
    {
        SR_ErrQuit("ERROR: Not enough memory for the storage of the read pair attributes in the read pair attribute array object.\n");
    }

    pAttrbtArray->capacity = newCapacity;
}
Beispiel #12
0
// Allocate an empty library information table.
//
// capAnchor  : initial capacity passed to SR_AnchorInfoAlloc
// capSample  : initial capacity passed to SR_SampleInfoAlloc
// capReadGrp : initial number of read groups the table can hold; sizes the
//              pLibInfo, pReadGrps, pSampleMap and pSeqTech arrays (all left
//              uninitialized) and the read group name hash (2 * capReadGrp)
//
// The returned table has size 0, capacity capReadGrp, and fragLenMax, cutoff
// and trimRate set to zero. A failed allocation of the table or one of its
// arrays is reported through SR_ErrQuit.
SR_LibInfoTable* SR_LibInfoTableAlloc(uint32_t capAnchor, uint32_t capSample, uint32_t capReadGrp)
{
    SR_LibInfoTable* pNewTable = (SR_LibInfoTable*) malloc(sizeof(SR_LibInfoTable));
    if (pNewTable == NULL)
        SR_ErrQuit("ERROR: Not enough memory for a library information table object.\n");

    pNewTable->pSampleInfo = SR_SampleInfoAlloc(capSample);

    pNewTable->pLibInfo = (SR_LibInfo*) malloc(capReadGrp * sizeof(SR_LibInfo));
    if (pNewTable->pLibInfo == NULL)
        SR_ErrQuit("ERROR: Not enough memory for the storage of library information in an library table object.\n");

    // check the pointer that was just allocated (the read group names), not pLibInfo
    pNewTable->pReadGrps = (char**) malloc(capReadGrp * sizeof(char*));
    if (pNewTable->pReadGrps == NULL)
        SR_ErrQuit("ERROR: Not enough memory for the storage of read group names in an library table object.\n");

    pNewTable->pSampleMap = (int32_t*) malloc(capReadGrp * sizeof(int32_t));
    if (pNewTable->pSampleMap == NULL)
        SR_ErrQuit("ERROR: Not enough memory for the storage of read-group-to-sample map in an library table object.\n");

    pNewTable->pSeqTech = (int8_t*) malloc(capReadGrp * sizeof(int8_t));
    if (pNewTable->pSeqTech == NULL)
        SR_ErrQuit("ERROR: Not enough memory for the storage of sequencing technologies in an library table object.\n");

    pNewTable->pAnchorInfo = SR_AnchorInfoAlloc(capAnchor);

    // read group name -> index hash
    pNewTable->pReadGrpHash = kh_init(name);
    kh_resize(name, pNewTable->pReadGrpHash, 2 * capReadGrp);

    pNewTable->size = 0;
    pNewTable->capacity = capReadGrp;
    pNewTable->fragLenMax = 0;
    pNewTable->cutoff = 0.0;
    pNewTable->trimRate = 0.0;

    return pNewTable;
}
Beispiel #13
0
// Write the per-anchor data of an anchor information object into libFile.
//
// The data is written in this order:
//   1. pInfo->size anchor lengths (int32_t each)
//   2. pInfo->size * MD5_STR_LEN md5 characters
//   3. for every anchor: a uint32_t name length followed by the name bytes
//      without a terminator
// A short write is reported through SR_ErrQuit.
static void SR_AnchorInfoWrite(const SR_AnchorInfo* pInfo, FILE* libFile)
{
    // fwrite returns size_t; keep the full width for the comparisons below
    size_t writeSize = 0;

    writeSize = fwrite(pInfo->pLength, sizeof(int32_t), pInfo->size, libFile);
    if (writeSize != pInfo->size)
        SR_ErrQuit("ERROR: Cannot write the length of anchors into the information file.\n");

    writeSize = fwrite(pInfo->pMd5s, sizeof(char), pInfo->size * MD5_STR_LEN, libFile);
    if (writeSize != pInfo->size * MD5_STR_LEN)
        SR_ErrQuit("ERROR: Cannot write the md5 string into the information file.\n");

    for (unsigned int i = 0; i != pInfo->size; ++i)
    {
        uint32_t anchorNameLen = strlen(pInfo->pAnchors[i]);
        writeSize = fwrite(&(anchorNameLen), sizeof(uint32_t), 1, libFile);
        if (writeSize != 1)
            SR_ErrQuit("ERROR: Cannot write the anchor name length into the information file.\n");

        // this is the anchor name, so the message must not talk about samples
        writeSize = fwrite(pInfo->pAnchors[i], sizeof(char), anchorNameLen, libFile);
        if (writeSize != anchorNameLen)
            SR_ErrQuit("ERROR: Cannot write the anchor name into the information file.\n");
    }
}
Beispiel #14
0
// Register the reference sequences and all read groups of a bam header in the
// library information table.
//
// pTable     : table to update
// oldSize    : receives the number of read groups the table held on entry
// pBamHeader : header whose text is scanned for "@RG" tags
//
// Returns SR_ERR when SR_LibInfoTableAddAnchor rejects the header, SR_OK
// otherwise. A read group whose sample or ID cannot be registered only
// produces an error message; scanning continues with the next "@RG" tag.
// If the table capacity grew while read groups were added, pLibInfo is
// reallocated to the new capacity.
SR_Status SR_LibInfoTableSetRG(SR_LibInfoTable* pTable, unsigned int* oldSize, const SR_BamHeader* pBamHeader)
{
    const unsigned int prevCapacity = pTable->capacity;
    *oldSize = pTable->size;

    const char* cursor = pBamHeader->pOrigHeader->text;

    if (SR_LibInfoTableAddAnchor(pTable, pBamHeader) != SR_OK)
        return SR_ERR;

    // visit every "@RG" occurrence; stepping one character past the current
    // match keeps the search from finding the same tag again
    for (cursor = strstr(cursor, "@RG"); cursor != NULL; cursor = strstr(cursor + 1, "@RG"))
    {
        const char* lineEnd = strpbrk(cursor, "\n");
        int32_t sampleID = 0;

        if (SR_LibInfoTableAddSample(&sampleID, pTable, cursor, lineEnd) != SR_OK)
        {
            SR_ErrMsg("ERROR: the \"SM\" field is not found under the read group tag in the bam header.\n");
            continue;
        }

        if (SR_LibInfoTableAddReadGrp(pTable, cursor, lineEnd, sampleID) != SR_OK)
            SR_ErrMsg("ERROR: the \"ID\" field is required under the read group tag.\n");
    }

    // keep the library information array in step with the read group capacity
    if (pTable->capacity > prevCapacity)
    {
        pTable->pLibInfo = (SR_LibInfo*) realloc(pTable->pLibInfo, sizeof(SR_LibInfo) * pTable->capacity);
        if (pTable->pLibInfo == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of library summary in the library information object.\n");
    }

    return SR_OK;
}
Beispiel #15
0
// Read the next bam record from the bam file and store it in pBamInStream->pNewNode.
//
// Returns -1 immediately when the stream status is already negative.
// Otherwise a node is taken from the stream's memory pool, the record is read
// into it through the bam iterator when one is set (bam_iter_read) or
// sequentially when not (bam_read1), and the result of that read is both
// stored in bam_cur_status and returned. An exhausted memory pool is reported
// through SR_ErrQuit.
static inline int SR_BamInStreamLoadNext(SR_BamInStream* pBamInStream)
{
    if (pBamInStream->bam_cur_status < 0)
        return -1;

    pBamInStream->pNewNode = SR_BamNodeAlloc(pBamInStream->pMemPool);
    if (pBamInStream->pNewNode == NULL)
    {
        SR_ErrQuit("ERROR: Too many unpaired reads are stored in the memory. Please use smaller bin size or disable searching pair genomically.\n");
    }

    const int readStatus = (pBamInStream->pBamIterator != NULL)
        ? bam_iter_read(pBamInStream->fpBamInput, *(pBamInStream->pBamIterator), &(pBamInStream->pNewNode->alignment))
        : bam_read1(pBamInStream->fpBamInput, &(pBamInStream->pNewNode->alignment));

    pBamInStream->bam_cur_status = readStatus;

    return readStatus;
}
Beispiel #16
0
// Create a bam input stream for the given bam file.
//
// bamFilename  : path of the bam file to open for reading
// binLen       : bin length stored in the stream; replaced by SR_MAX_BIN_LEN
//                when SR_PAIR_GENOMICALLY is set in the stream mode
// numThreads   : number of return lists to allocate; with 0 no return lists or
//                alignment types are allocated and reportSize is stored as 0
// buffCapacity : capacity passed to SR_BamMemPoolAlloc
// reportSize   : stored in the stream, used to size the alignment type array
//                (numThreads * reportSize) and the query name hashes
// pStreamMode  : supplies the filter function, filter data and control flags
//
// When SR_USE_BAM_INDEX is set the bam index is loaded; if it cannot be loaded
// it is built and loaded again. Failure to open the bam file, to obtain the
// requested index or to allocate memory is reported through SR_ErrQuit.
SR_BamInStream* SR_BamInStreamAlloc(const char* bamFilename, uint32_t binLen, unsigned int numThreads, unsigned int buffCapacity, 
                                    unsigned int reportSize, const SR_StreamMode* pStreamMode)
{
    SR_BamInStream* pBamInStream = (SR_BamInStream*) calloc(1, sizeof(SR_BamInStream));
    if (pBamInStream == NULL)
        SR_ErrQuit("ERROR: Not enough memory for a bam input stream object.");

    // stays negative until the stream is completely set up
    pBamInStream->bam_cur_status = -1;

    pBamInStream->fpBamInput = bam_open(bamFilename, "r");
    if (pBamInStream->fpBamInput == NULL)
        SR_ErrQuit("ERROR: Cannot open bam file %s for reading.\n", bamFilename);

    if ((pStreamMode->controlFlag & SR_USE_BAM_INDEX) != 0)
    {
        pBamInStream->pBamIndex = bam_index_load(bamFilename);
        if (pBamInStream->pBamIndex == NULL)
        {
            SR_ErrMsg("WARNING: Cannot open bam index file for reading. Creating it......");
            bam_index_build(bamFilename);

            // building may fail as well (e.g. unsorted input or an unwritable
            // directory); do not continue with a NULL index the caller asked for
            pBamInStream->pBamIndex = bam_index_load(bamFilename);
            if (pBamInStream->pBamIndex == NULL)
                SR_ErrQuit("ERROR: Cannot create or load the bam index file for %s.\n", bamFilename);

            SR_ErrMsg("         The bam index is created.");
        }
    }

    pBamInStream->filterFunc = pStreamMode->filterFunc;
    pBamInStream->filterData = pStreamMode->filterData;
    pBamInStream->numThreads = numThreads;
    pBamInStream->reportSize = reportSize;
    pBamInStream->currRefID = NO_QUERY_YET;
    pBamInStream->currBinPos = NO_QUERY_YET;
    pBamInStream->binLen = binLen;
    pBamInStream->pNewNode = NULL;
    pBamInStream->pBamIterator = NULL;

    if (numThreads > 0)
    {
        pBamInStream->pRetLists = (SR_BamList*) calloc(numThreads, sizeof(SR_BamList));
        if (pBamInStream->pRetLists == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of retrun alignment lists in the bam input stream object.\n");

        pBamInStream->pAlgnTypes = (SR_AlgnType*) malloc(numThreads * reportSize * sizeof(SR_AlgnType));
        if (pBamInStream->pAlgnTypes == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of pair alignment type in the bam input stream object.\n");
    }
    else
    {
        pBamInStream->pRetLists = NULL;
        pBamInStream->pAlgnTypes = NULL;
        pBamInStream->reportSize = 0;
    }

    // the hash for the previous bin is only created when SR_PAIR_GENOMICALLY is not set
    if ((pStreamMode->controlFlag & SR_PAIR_GENOMICALLY) == 0)
    {
        pBamInStream->pNameHashes[PREV_BIN] = kh_init(queryName);
        kh_resize(queryName, pBamInStream->pNameHashes[PREV_BIN], reportSize);
    }
    else
    {
        pBamInStream->pNameHashes[PREV_BIN] = NULL;
        pBamInStream->binLen = SR_MAX_BIN_LEN;
    }

    pBamInStream->pNameHashes[CURR_BIN] = kh_init(queryName);
    kh_resize(queryName, pBamInStream->pNameHashes[CURR_BIN], reportSize);

    pBamInStream->pMemPool = SR_BamMemPoolAlloc(buffCapacity);

    pBamInStream->bam_cur_status = 1;

    return pBamInStream;
}
Beispiel #17
0
// Read a library information table from libFile.
//
// The records are read in this order:
//   1. number of anchors, number of samples, number of read groups (uint32_t each)
//   2. the anchor information (SR_AnchorInfoRead)
//   3. the sample information (SR_SampleInfoRead)
//   4. the read-group-to-sample map (one int32_t per read group)
//   5. for every read group: uint32_t name length, then the name bytes
//      without a terminator
//   6. fragLenMax (uint32_t), cutoff (double), trimRate (double)
//   7. one raw SR_LibInfo structure per read group
// The table is allocated with exactly the capacities found in the file. Every
// read group name is stored as a newly malloc'ed, null-terminated string and
// registered in pReadGrpHash with its index as the value.
// Read and allocation failures are reported through SR_ErrQuit.
SR_LibInfoTable* SR_LibInfoTableRead(FILE* libFile)
{
    // fread returns size_t; keep the full width for the comparisons below
    size_t readSize = 0;
    uint32_t sizeAC = 0;
    uint32_t sizeSM = 0;
    uint32_t sizeRG = 0;

    readSize = fread(&sizeAC, sizeof(uint32_t), 1, libFile);
    if (readSize != 1)
        SR_ErrQuit("ERROR: Cannot read the number of anchors from the library file.\n");

    readSize = fread(&sizeSM, sizeof(uint32_t), 1, libFile);
    if (readSize != 1)
        SR_ErrQuit("ERROR: Cannot read the number of sample from the library file.\n");

    readSize = fread(&sizeRG, sizeof(uint32_t), 1, libFile);
    if (readSize != 1)
        SR_ErrQuit("ERROR: Cannot read the number of read group from the library file.\n");

    SR_LibInfoTable* pTable = SR_LibInfoTableAlloc(sizeAC, sizeSM, sizeRG);

    // the sub-readers take the element counts from the objects themselves
    pTable->pAnchorInfo->size = sizeAC;
    pTable->pSampleInfo->size = sizeSM;
    pTable->size = sizeRG;

    pTable->pAnchorInfo->capacity = sizeAC;
    pTable->pSampleInfo->capacity = sizeSM;
    pTable->capacity = sizeRG;

    SR_AnchorInfoRead(pTable->pAnchorInfo, libFile);
    SR_SampleInfoRead(pTable->pSampleInfo, libFile);

    readSize = fread(pTable->pSampleMap, sizeof(int32_t), sizeRG, libFile);
    if (readSize != sizeRG)
        SR_ErrQuit("ERROR: Cannot read the read-group-to-sample map from the library file.\n");

    uint32_t readGrpNameLen = 0;

    for (unsigned int i = 0; i != sizeRG; ++i)
    {
        readSize = fread(&readGrpNameLen, sizeof(uint32_t), 1, libFile);
        if (readSize != 1)
            SR_ErrQuit("ERROR: Cannot read the length of read group name from the library file.\n");

        // The length comes straight from the file. In 32-bit arithmetic
        // UINT32_MAX + 1 wraps to 0, which would make the terminator write
        // below land far outside the allocation.
        if (readGrpNameLen == UINT32_MAX)
            SR_ErrQuit("ERROR: Invalid length of read group name in the library file.\n");

        pTable->pReadGrps[i] = (char*) malloc(((size_t) readGrpNameLen + 1) * sizeof(char));
        if (pTable->pReadGrps[i] == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of the read group name.\n");

        pTable->pReadGrps[i][readGrpNameLen] = '\0';

        readSize = fread(pTable->pReadGrps[i], sizeof(char), readGrpNameLen, libFile);
        if (readSize != readGrpNameLen)
            SR_ErrQuit("ERROR: Cannot read the read group name from the library file.\n");

        // map the read group name to its index
        int ret = 0;
        khiter_t khIter = kh_put(name, pTable->pReadGrpHash, pTable->pReadGrps[i], &ret);
        kh_value((khash_t(name)*) pTable->pReadGrpHash, khIter) = i;
    }

    readSize = fread(&(pTable->fragLenMax), sizeof(uint32_t), 1, libFile);
    if (readSize != 1)
        SR_ErrQuit("ERROR: Cannot read the maximum fragment length from the library file.\n");

    readSize = fread(&(pTable->cutoff), sizeof(double), 1, libFile);
    if (readSize != 1)
        SR_ErrQuit("ERROR: Cannot read the cutoff from the library file.\n");

    readSize = fread(&(pTable->trimRate), sizeof(double), 1, libFile);
    if (readSize != 1)
        SR_ErrQuit("ERROR: Cannot read the trim rate from the library file.\n");

    readSize = fread(pTable->pLibInfo, sizeof(SR_LibInfo), pTable->size, libFile);
    if (readSize != pTable->size)
        SR_ErrQuit("ERROR: Cannot read the library information from the library file.\n");

    return pTable;
}
Beispiel #18
0
// Register the reference sequences (anchors) of a bam header in the table, or,
// if the table already holds anchors, verify that the header agrees with them.
//
// Table already loaded (anchor size != 0): the number of references, the index
// of every reference name, its length (only when the stored length is
// positive) and its md5 string must match what is stored; the first mismatch
// prints an error message and returns SR_ERR.
//
// Table empty: the anchor information is re-allocated if it is too small, then
// name, length and md5 of every reference are copied from the header.
// References whose name starts with "GL0", "NC_", "NT_" or "hs" get length -1.
//
// pBamHeader->pMD5s may be NULL (the header carried no usable M5 tags). In that
// case the stored md5 of each anchor is zero-filled, and the md5 comparison is
// skipped whenever either side has no md5.
//
// Returns SR_OK on success.
static SR_Status SR_LibInfoTableAddAnchor(SR_LibInfoTable* pTable, const SR_BamHeader* pBamHeader)
{
    const SR_Bool isLoaded = pTable->pAnchorInfo->size == 0 ? FALSE : TRUE;

    int ret = 0;
    khiter_t khIter = 0;

    if (isLoaded)
    {
        if (pBamHeader->pOrigHeader->n_targets != pTable->pAnchorInfo->size)
        {
            SR_ErrMsg("ERROR: The number of reference sequences in this bam file is inconsistent with that in previous bam files.\n");
            return SR_ERR;
        }

        unsigned int refIndex = 0;
        for (unsigned int i = 0; i != pBamHeader->pOrigHeader->n_targets; ++i)
        {
            // ret == 0 is treated as "name already present in the hash".
            // NOTE(review): on a miss kh_put presumably leaves the header's
            // name pointer behind as a new key even though SR_ERR is returned;
            // a pure lookup would avoid that - confirm against khash.
            khIter = kh_put(name, pTable->pAnchorInfo->pAnchorHash, pBamHeader->pOrigHeader->target_name[i], &ret);
            if (ret == 0)
            {
                khash_t(name)* pAnchorHash = pTable->pAnchorInfo->pAnchorHash;
                refIndex = kh_value(pAnchorHash, khIter);
                if (refIndex != i)
                {
                    SR_ErrMsg("ERROR: Reference ID in this bam file is inconsistent with that in the previous bam files.\n");
                    return SR_ERR;
                }

                if (pTable->pAnchorInfo->pLength[i] > 0 && pTable->pAnchorInfo->pLength[i] != pBamHeader->pOrigHeader->target_len[i])
                {
                    SR_ErrMsg("ERROR: The length of the reference sequence in this bam file is inconsistent with that in previous bam files.\n");
                    return SR_ERR;
                }
            }
            else
            {
                SR_ErrMsg("ERROR: Found a reference sequence that is not in the previous bam headers.\n");
                return SR_ERR;
            }

            // compare the md5 strings only when both the table and this header have one
            const char* pStoredMd5 = pTable->pAnchorInfo->pMd5s + MD5_STR_LEN * i;
            if (pBamHeader->pMD5s != NULL && pStoredMd5[0] != '\0'
                && strncmp(pStoredMd5, pBamHeader->pMD5s[i], MD5_STR_LEN) != 0)
            {
                SR_ErrMsg("ERROR: MD5 string in this bam file is inconsistent with that in previous bam files.\n");
                return SR_ERR;
            }
        }
    }
    else
    {
        if (pBamHeader->pOrigHeader->n_targets > pTable->pAnchorInfo->capacity)
        {
            SR_AnchorInfoFree(pTable->pAnchorInfo);
            pTable->pAnchorInfo = SR_AnchorInfoAlloc(pBamHeader->pOrigHeader->n_targets);
        }

        pTable->pAnchorInfo->size = pBamHeader->pOrigHeader->n_targets;

        for (unsigned int i = 0; i != pBamHeader->pOrigHeader->n_targets; ++i)
        {
            unsigned int nameLen = strlen(pBamHeader->pOrigHeader->target_name[i]);

            // calloc provides the terminator: only nameLen characters are copied
            pTable->pAnchorInfo->pAnchors[i] = (char*) calloc(nameLen + 1, sizeof(char));
            if (pTable->pAnchorInfo->pAnchors[i] == NULL)
                SR_ErrQuit("ERROR: Not enough memory for the storage of reference name.\n");

            strncpy(pTable->pAnchorInfo->pAnchors[i], pBamHeader->pOrigHeader->target_name[i], nameLen);
            khIter = kh_put(name, pTable->pAnchorInfo->pAnchorHash, pTable->pAnchorInfo->pAnchors[i], &ret);

            khash_t(name)* pAnchorHash = pTable->pAnchorInfo->pAnchorHash;
            kh_value(pAnchorHash, khIter) = i;

            // set the length of those unused references to -1 so that
            // we will ignore any reads aligned to them
            pTable->pAnchorInfo->pLength[i] = pBamHeader->pOrigHeader->target_len[i];
            if (strncmp("GL0", pTable->pAnchorInfo->pAnchors[i], 3) == 0
                || strncmp("NC_", pTable->pAnchorInfo->pAnchors[i], 3) == 0
                || strncmp("NT_", pTable->pAnchorInfo->pAnchors[i], 3) == 0
                || strncmp("hs", pTable->pAnchorInfo->pAnchors[i], 2) == 0)
            {
                pTable->pAnchorInfo->pLength[i] = -1;
            }

            // the md5 storage is uninitialized; record "no md5" explicitly
            // instead of dereferencing a missing md5 table
            if (pBamHeader->pMD5s != NULL)
                strncpy(pTable->pAnchorInfo->pMd5s + MD5_STR_LEN * i, pBamHeader->pMD5s[i], MD5_STR_LEN);
            else
                memset(pTable->pAnchorInfo->pMd5s + MD5_STR_LEN * i, 0, MD5_STR_LEN);
        }
    }

    return SR_OK;
}
Beispiel #19
0
// Register the read group described by one @RG header line in the table.
//
// pTable   : table to update
// tagPos   : start of the @RG line inside the header text
// lineEnd  : position of the newline that ends this line, or NULL if the
//            caller could not find one
// sampleID : index of the sample this read group belongs to
//
// Both the "ID:" and the "PL:" field must be present on the line. The
// read group arrays grow when the table is full. The sequencing technology is
// recorded through SR_LibInfoTableAddSeqTech, then the read group name is
// copied, added to the read group hash and mapped to sampleID.
//
// Returns SR_OK when the read group was added, SR_ERR when a field is missing,
// there is no line end, SR_LibInfoTableAddSeqTech fails, the name is empty, or
// the read group ID is already known.
static SR_Status SR_LibInfoTableAddReadGrp(SR_LibInfoTable* pTable, const char* tagPos, const char* lineEnd, int sampleID)
{
    // without a line end there is no way to tell whether a tag belongs to this read group
    if (lineEnd == NULL)
        return SR_ERR;

    // get the name of the current read group
    const char* readGrpNamePos = strstr(tagPos, "ID:");
    if (readGrpNamePos != NULL && readGrpNamePos < lineEnd)
        readGrpNamePos += 3;
    else
        return SR_ERR;

    const char* platformPos = strstr(tagPos, "PL:");
    if (platformPos != NULL && platformPos < lineEnd)
        platformPos += 3;
    else
        return SR_ERR;

    // expand the array if necessary
    if (pTable->size == pTable->capacity)
    {
        // doubling a capacity of zero would leave no room for the new entry
        pTable->capacity = (pTable->capacity == 0) ? 1 : pTable->capacity * 2;

        pTable->pReadGrps = (char**) realloc(pTable->pReadGrps, pTable->capacity * sizeof(char*));
        if (pTable->pReadGrps == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of read group names in the library information object.\n");

        pTable->pSampleMap = (int32_t*) realloc(pTable->pSampleMap, pTable->capacity * sizeof(int32_t));
        if (pTable->pSampleMap == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of the sample ID map in the library information object.\n");

        pTable->pSeqTech = (int8_t*) realloc(pTable->pSeqTech, pTable->capacity * sizeof(int8_t));
        if (pTable->pSeqTech == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of the sequencing technology in the library information  object.\n");
    }

    SR_Status status = SR_LibInfoTableAddSeqTech(pTable, platformPos);
    if (status != SR_OK)
        return SR_ERR;

    // the name runs up to the next blank, tab or newline
    const char* readGrpNameEnd = strpbrk(readGrpNamePos, " \t\n");
    if (readGrpNameEnd == NULL)
        return SR_ERR;

    size_t readGrpNameLen = readGrpNameEnd - readGrpNamePos;
    if (readGrpNameLen > 0)
    {
        // calloc provides the terminator
        pTable->pReadGrps[pTable->size] = (char*) calloc(readGrpNameLen + 1, sizeof(char));
        if (pTable->pReadGrps[pTable->size] == NULL)
            SR_ErrQuit("ERROR: Not enough memory for the storage of read group name in the fragment length distribution object.\n");

        memcpy(pTable->pReadGrps[pTable->size], readGrpNamePos, readGrpNameLen);

        int ret = 0;

        khash_t(name)* pReadGrpHash = pTable->pReadGrpHash;
        khiter_t khIter = kh_put(name, pReadGrpHash, pTable->pReadGrps[pTable->size], &ret);

        if (ret != 0)
        {
            // new read group: the hash key and pReadGrps share the same string
            pTable->pSampleMap[pTable->size] = sampleID;
            kh_value(pReadGrpHash, khIter) = pTable->size;
            ++(pTable->size);

            return SR_OK;
        }
        else
        {
            // the ID is already registered: drop the copy
            free(pTable->pReadGrps[pTable->size]);
            pTable->pReadGrps[pTable->size] = NULL;
            SR_ErrMsg("ERROR: Found a duplicated read group ID.\n");
        }
    }

    return SR_ERR;
}